Skip to content

Commit 2acf892

Browse files
authored
Merge pull request #43 from JuliaMath/float32
Take Float32 a little more seriously
2 parents dca815d + 0c356ea commit 2acf892

File tree

12 files changed

+299
-176
lines changed

12 files changed

+299
-176
lines changed

NEWS.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,12 @@ For bug fixes, performance enhancements, or fixes to unexported functions we wil
1212

1313
# Version 0.2.0
1414

15+
### Added
16+
- Add faster, more optimized methods for Float32 calculations ([PR #43](https://github.com/JuliaMath/Bessels.jl/pull/43))
17+
18+
### Fixed
19+
- Reduce compile time and time to first call of besselj and bessely ([PR #42](https://github.com/JuliaMath/Bessels.jl/pull/42))
20+
1521
### Added
1622
- add an unexported method (`Bessels.besseljy(nu, x)`) for faster computation of `besselj` and `bessely` (#33)
1723
- add exported methods for Hankel functions `besselh(nu, k, x)`, `hankelh1(nu, x)`, `hankelh2(nu, x)` (#33)

src/Float128/besselj.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
function besselj0(x::BigFloat)
1+
function _besselj0(x::BigFloat)
22
x = abs(x)
33
T = eltype(x)
44
if iszero(x)

src/U_polynomials.jl

Lines changed: 95 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -14,16 +14,19 @@
1414
# [2] http://dlmf.nist.gov/10.41.E9
1515
# [3] https://dlmf.nist.gov/10.41
1616

17+
#####
18+
##### Large order expansion for J_{nu}(x) and Y_{nu}(x), valid for nu > x
19+
#####
20+
1721
"""
1822
besseljy_debye(nu, x::T)
1923
2024
Debye's asymptotic expansion for large order, valid when v → ∞ and x < v.
2125
Returns both besselj and bessely
2226
"""
23-
function besseljy_debye(v, x)
24-
T = eltype(x)
27+
function besseljy_debye(v, x::T) where T
2528
S = promote_type(T, Float64)
26-
x = S(x)
29+
v, x = S(v), S(x)
2730

2831
vmx = (v + x) * (v - x)
2932
vs = sqrt(vmx)
@@ -36,12 +39,47 @@ function besseljy_debye(v, x)
3639
p = v / vs
3740
p2 = v^2 / vmx
3841

39-
Uk_Jn, Uk_Yn = Uk_poly_Jn(p, v, p2, x)
40-
42+
Uk_Jn, Uk_Yn = Uk_poly_Jn(p, v, p2, T(x))
4143
return coef_Jn * Uk_Jn, coef_Yn * Uk_Yn
4244
end
4345

44-
besseljy_debye_cutoff(nu, x) = nu > 2.0 + 1.00035*x + Base.Math._approx_cbrt(Float64(302.681)*x) && nu > 15
46+
# Cutoffs for besseljy_debye expansions
47+
# regions where the debye expansions for large orders v > x are valid
48+
# determined by fitting a curve a + b*x + c*cbrt(x) to the region where the Debye expansions provide the desired precision
49+
50+
# Float32
51+
besseljy_debye_fit(x::Float32) = 2.5f0 + 1.00035f0*x + 7.114f0*Base.Math._approx_cbrt(x)
52+
besseljy_debye_cutoff(nu, x::Float32) = nu > besseljy_debye_fit(x) && nu > 6
53+
54+
# Float64
55+
besseljy_debye_fit(x::Float64) = 2.0 + 1.00035*x + 6.714*Base.Math._approx_cbrt(x)
56+
besseljy_debye_cutoff(nu, x::Float64) = nu > besseljy_debye_fit(x) && nu > 15
57+
58+
# Float128 - provides roughly 1e-35 precision
59+
#besseljy_debye_fit(x) = 16.0 + 1.0012*x + Base.Math._approx_cbrt(Float64(27.91*x))
60+
#besseljy_debye_cutoff(nu, x) = nu > besseljy_debye_fit(x) && nu > 40
61+
62+
#####
63+
##### Debye large order expansion coefficients
64+
#####
65+
66+
function Uk_poly_Jn(p, v, p2, x::Float64)
67+
if v > 5.0 + 1.00033*x + 11.26*Base.Math._approx_cbrt(x)
68+
return Uk_poly10(p, v, p2)
69+
else
70+
return Uk_poly20(p, v, p2)
71+
end
72+
end
73+
Uk_poly_Jn(p, v, p2, x::Float32) = Uk_poly5(p, v, p2)
74+
75+
Uk_poly_In(p, v, p2, ::Type{Float32}) = Uk_poly5(p, v, p2)[1]
76+
Uk_poly_In(p, v, p2, ::Type{Float64}) = Uk_poly10(p, v, p2)[1]
77+
Uk_poly_Kn(p, v, p2, ::Type{Float32}) = Uk_poly5(p, v, p2)[2]
78+
Uk_poly_Kn(p, v, p2, ::Type{Float64}) = Uk_poly10(p, v, p2)[2]
79+
80+
#####
81+
##### Large order expansion for Hankel functions, valid for x > nu
82+
#####
4583

4684
"""
4785
hankel_debye(nu, x::T)
@@ -51,7 +89,7 @@ Return the Hankel function H(nu, x) = J(nu, x) + Y(nu, x)*im
5189
"""
5290
function hankel_debye(v, x::T) where T
5391
S = promote_type(T, Float64)
54-
x = S(x)
92+
v, x = S(v), S(x)
5593

5694
vmx = abs((v + x) * (x - v))
5795
vs = sqrt(vmx)
@@ -63,55 +101,39 @@ function hankel_debye(v, x::T) where T
63101
p = v / vs
64102
p2 = v^2 / vmx
65103

66-
_, Uk_Yn = Uk_poly_Hankel(p*im, v, -p2, x)
67-
104+
_, Uk_Yn = Uk_poly_Hankel(p*im, v, -p2, T(x))
68105
return coef_Yn * Uk_Yn
69106
end
70107

71-
hankel_debye_cutoff(nu, x) = nu < 0.2 + x + Base.Math._approx_cbrt(-411*x)
108+
# Cutoffs for hankel_debye expansions
109+
# regions where the debye expansions for large orders x > v are valid
110+
# determined by fitting a curve a + x + b*cbrt(-x) to the region where the Debye expansions provide the desired precision
111+
112+
# Float32
113+
hankel_debye_fit(x::Float32) = -3.5f0 + x + 7.435f0*Base.Math._approx_cbrt(-x)
114+
hankel_debye_cutoff(nu, x::Float32) = nu < hankel_debye_fit(x)
115+
116+
# Float64
117+
hankel_debye_fit(x::Float64) = 0.2 + x + 7.435*Base.Math._approx_cbrt(-x)
118+
hankel_debye_cutoff(nu, x::Float64) = nu < hankel_debye_fit(x)
119+
120+
# Float128
121+
#hankel_debye_cutoff(nu, x) = nu < -2 + 0.9987*x + Base.Math._approx_cbrt(-21570.3*Float64(x))
72122

73-
function Uk_poly_Jn(p, v, p2, x::T) where T <: Float64
74-
if v > 5.0 + 1.00033*x + Base.Math._approx_cbrt(1427.61*x)
75-
return Uk_poly10(p, v, p2)
76-
else
77-
return Uk_poly20(p, v, p2)
78-
end
79-
end
80123
function Uk_poly_Hankel(p, v, p2, x::T) where T <: Float64
81-
if v < 5.0 + 0.998*x + Base.Math._approx_cbrt(-1171.34*x)
124+
if v < 5.0 + 0.998*x + 10.541*Base.Math._approx_cbrt(-x)
82125
return Uk_poly10(p, v, p2)
83126
else
84127
return Uk_poly20(p, v, p2)
85128
end
86129
end
87-
Uk_poly_Hankel(p, v, p2, x) = Uk_poly_Jn(p, v, p2, x::BigFloat)
88-
89-
Uk_poly_In(p, v, p2, ::Type{Float32}) = Uk_poly5(p, v, p2)[1]
90-
Uk_poly_In(p, v, p2, ::Type{Float64}) = Uk_poly10(p, v, p2)[1]
91-
Uk_poly_Kn(p, v, p2, ::Type{Float32}) = Uk_poly5(p, v, p2)[2]
92-
Uk_poly_Kn(p, v, p2, ::Type{Float64}) = Uk_poly10(p, v, p2)[2]
93130

94-
@inline function split_evalpoly(x, P)
95-
# polynomial P must have an even number of terms
96-
N = length(P)
97-
xx = x*x
131+
Uk_poly_Hankel(p, v, p2, x::Float32) = Uk_poly5(p, v, p2)
132+
Uk_poly_Hankel(p, v, p2, x) = Uk_poly_Jn(p, v, p2, x)
98133

99-
out = P[end]
100-
out2 = P[end-1]
101-
102-
for i in N-2:-2:2
103-
out = muladd(xx, out, P[i])
104-
out2 = muladd(xx, out2, P[i-1])
105-
end
106-
if iszero(rem(N, 2))
107-
out *= x
108-
return out2 - out, out2 + out
109-
else
110-
out = muladd(xx, out, P[1])
111-
out2 *= x
112-
return out - out2, out2 + out
113-
end
114-
end
134+
#####
135+
##### U - polynomials
136+
#####
115137

116138
function Uk_poly5(p, v, p2)
117139
u0 = 1.0
@@ -169,7 +191,8 @@ function Uk_poly20(p, v, p2)
169191
return split_evalpoly(-p/v, Poly)
170192
end
171193

172-
function Uk_poly_Jn(p, v, p2, x::T) where T <: BigFloat
194+
# implementation for arbitrary precision
195+
function Uk_poly_Jn(p, v, p2, x::T) where T
173196
u0 = one(T)
174197
u1 = evalpoly(p2, (3, -5)) / 24
175198
u2 = evalpoly(p2, (81, -462, 385)) / 1152
@@ -192,14 +215,35 @@ function Uk_poly_Jn(p, v, p2, x::T) where T <: BigFloat
192215
u19 = evalpoly(p2, (3761719809509744584195141470215239968860161850335693359375, -1694136104847790923543013061207680485991618397125797873046875, 131518243789528012257323287684549590712219590887856743595703125, -4179129511260217209133623104486559148268490002722848244991240625, 72088266652871136165181276891337618088740053757385007292240377500, -776773977214820033621339638022401758862209813648991378355261599500, 5677394779818071523358076045292335690018332546439714641415150037636, -29667822591847140970922062603154078948366431649274897672067206014580, 114800233558787974267088388638654159779991991716667850063043161177890, -336788945225975997668966486515468748933754942064781256509404101275850, 760599837839891632010677514872412397427959022966733699428711208643750, -1333776105497652608917805362422659621440699261908512326717363285968750, 1821026790520428617034899967974569000414933301802005578083458082187500, -1929493390007550512825649545883182667978851358936168551128507440937500, 1570570304135828069768211937927131121377451398878663909061910457812500, -963355744456233323886326422779275308259891167457840027230510476562500, 430744376085805585076333464506126217301082462279781595185335615234375, -132496442776420617057548516091451843431549350710841121872822607421875, 25067118709566753991500713640672585065184296412786618544560205078125, -2198870062242697718552694179006367110981078632700580574084228515625)) / 980570799589976236952265721984567958568960000000
193216
u20 = evalpoly(p2, (24030618487110150352755402740028995969072485932314476318359375, -11984379509393886990049682627416290126645799829432886859195312500, 1028816773596376675865176501621547688509187479681533744365175781250, -36136687211104276266628386141898079997731413843884113675698994212500, 689368394712060288513879526213143279387995331221181709158682715671875, -8226054591925395046897310265711439061232478740113589342810359827971600, 66729727980314206345594148553023187689855688854650386958448499394886808, -388235990422199036481157075090292917209702371811112714609151030385239376, 1679621555358289731717827244171539003686077073065087747585589973854567950, -5539024239213671573375139503069183935283937579486175606155069574939719000, 14158993985687610069291256086138044030538819617545349322200892501730615500, -28351273454978996507857547213496220444080209057761228571819950144231575000, 44700502703654649774362294361156754154712937287563245669685024068473468750, -55504285315413734114196925709059066734450069228526494661323483726671250000, 53996017865021964397812742861916826936440170527545458081631955240159375000, -40677946447845849750014263157052100427921007881029648608310681741346250000, 23250401373415767366970579801426233116241475047289980901600427561701171875, -9744288621182737996606001891415854305391392962559389056448157541132812500, 2823768330714179467082676027577350697536971031741905897356560592675781250, -505537818270094147077012813306995849751437826286925078626556809570312500, 42128151522507845589751067775582987479286485523910423218879734130859375)) / 658943577324464031231922565173629668158341120000000
194217
u21 = evalpoly(p2, (1990919576929585163761247434580873723897677550573731281207275390625, -1094071724893410272201314846735800345180930752601602224440504638671875, 103359845691236916270005420886293061281368263471845448279855946394531250, -3993558675585831919973605001813276532326292885934464373884268722794718750, 83832389176247515253189196480455786065824463879054932596114704927571390625, -1101977288759423115916484463959767795703317649005495798822550381533967842875, 9865413224996821913093061266347613112205666995211786544733850490740903131000, -63509799534443193918054738326913581450607131662586232327271654588312820660616, 305080179205137176095342856930260393560550505786421683990077008259370104309410, -1122099959965657526344757894103766710826629020929459670948834078441128579791750, 3217185258543282976637373169047731813093578126603884759856059695249999306047500, -7276835259402589085458442196686990645669654847254510877738804991391058411412500, 13076651508858985557227319801010895818335803028865299751194038965212274849406250, -18719023753854332443081892087572754119280558462994056787647908433841920235468750, 21307327225910629020600045595173860611314592374884901403059587980149276271875000, -19156728115567236234371593788102013666866274144599851347429312225076676478125000, 13430107219321888006291381503763318985372222835071804983658005207838642173828125, -7186150593017474773099947110903785965879266917528290917207712073663895771484375, 2834031566467842832851805860262261718160136985649155528263587796394390332031250, -776308737033643856044315139790308583914612827783322139063211433790569824218750, 131897976398031751060811874321878385924211075364673046295410087596954345703125, -10468093364923154846096180501736379835254847251164527483762705364837646484375)) / 5456052820246562178600318839637653652351064473600000000
195-
u22 = evalpoly(p2, (3.8335346613939443e12, -2.3109159761323565e15, 2.3920280120269997e17, -1.0121818379942089e19, 2.3275346258089414e20, -3.3544689122226785e21, 3.297557757461478e22, -2.336107524486965e23, 1.238524103792452e24, -5.0463598652544e24, 1.6103128541137314e25, -4.077501349206541e25, 8.26258535798955e25, -1.3459193994556415e26, 1.7635713272326644e26, -1.8526731041549917e26, 1.548092083577385e26, -1.0148048982766395e26, 5.103920268388802e25, -1.9006807535664433e25, 4.936185283790662e24, -7.980021228256559e23, 6.04547062746709e22))
196-
u23 = -evalpoly(p2, (4.218971570284097e13, -2.778481101311081e16, 3.1385283211499996e18, -1.4486387749510863e20, 3.6341499869780876e21, -5.7179919065432055e22, 6.144339925144987e23, -4.766924608251481e24, 2.774466490672939e25, -1.2449342046124282e26, 4.392130563430048e26, -1.2355529146787609e27, 2.7982068996977173e27, -5.131998439010333e27, 7.641216535678268e27, -9.228395023257356e27, 8.999255845917453e27, -7.02322235515725e27, 4.322773732100187e27, -2.050902994929233e27, 7.234243234844319e26, -1.7860680966743495e26, 2.753863007576946e25, -1.9955529040412654e24))
197-
u24 = evalpoly(p2, (4.8540146868529006e14, -3.4792991439250445e17, 4.273207395701127e19, -2.1435653415108537e21, 5.844687629283339e22, -1.0000750138961727e24, 1.1699189691874474e25, -9.896648661695488e25, 6.29370256208713e26, -3.0939194683063286e27, 1.1998211967644424e28, -3.7252346341093444e28, 9.358117764887965e28, -1.9153963148099324e29, 3.206650343980748e29, -4.395132918078325e29, 4.9215508698387624e29, -4.4775348387950634e29, 3.277658265637452e29, -1.9012207767547338e29, 8.536184882279286e28, -2.8599776383548e28, 6.728957650918171e27, -9.916401268407057e26, 6.886389769727123e25))
198-
u25 = -evalpoly(p2, (5.827244631566907e15, -4.5305357275125955e18, 6.029638127487473e20, -3.2761234100445222e22, 9.675654883193622e23, -1.7941040647617987e25, 2.2764310713849358e26, -2.0914533474677945e27, 1.4471195817119858e28, -7.757785573404132e28, 3.2900927159291354e29, -1.1210232552135908e30, 3.1034661143911036e30, -7.036055338636485e30, 1.3128796688902614e31, -2.0208792587851872e31, 2.5653099826522344e31, -2.6771355605594045e31, 2.2823085118856488e31, -1.5730388076301427e31, 8.627355824571355e30, -3.676221426681414e30, 1.1728484268744769e30, -2.6355294419807464e29, 3.7195112743738626e28, -2.479674182915908e27))
199-
200-
Poly = (u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15, u16, u17, u18, u19, u20)
218+
Poly = (u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15, u16, u17, u18, u19, u20, u21)
201219
return split_evalpoly(-p/v, Poly)
202220
end
221+
222+
# performs a second-order Horner scheme for polynomial evaluation
223+
# computes the even and odd coefficients of the polynomial independently within a loop to reduce latency
224+
# splits the polynomial to compute both 1 + ax + bx^2 + cx^3 and 1 - ax + bx^2 - cx^3 ....
225+
@inline function split_evalpoly(x, P)
226+
# P needs at least two terms; both even and odd lengths are handled by the parity branch below
227+
N = length(P)
228+
xx = x*x
229+
230+
out = P[end]
231+
out2 = P[end-1]
232+
233+
for i in N-2:-2:2
234+
out = muladd(xx, out, P[i])
235+
out2 = muladd(xx, out2, P[i-1])
236+
end
237+
if iszero(rem(N, 2))
238+
out *= x
239+
return out2 - out, out2 + out
240+
else
241+
out = muladd(xx, out, P[1])
242+
out2 *= x
243+
return out - out2, out2 + out
244+
end
245+
end
246+
203247
#=
204248
u0 = one(x)
205249
u1 = p / 24 * (3 - 5*p^2) * -1 / v

src/asymptotics.jl

Lines changed: 42 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,13 @@
1717
# Cutoffs were determined manually for each input type.
1818
# The AbstractFloat cutoff gives a relative error of roughly quadruple-precision accuracy (1e-35)
1919

20-
#besseljy_large_argument_min(::Type{Float32}) = 15.0f0
21-
besseljy_large_argument_min(::Type{Float64}) = 20.0
22-
besseljy_large_argument_min(::Type{T}) where T <: AbstractFloat = 40.0
20+
besseljy_large_argument_min(x::Float32) = 15.0f0
21+
besseljy_large_argument_min(x::Float64) = 20.0
22+
#besseljy_large_argument_min(x) = 40.0
2323

24-
#besseljy_large_argument_cutoff(v, x::Float32) = (x > 1.2f0*v && x > besseljy_large_argument_min(Float32))
25-
besseljy_large_argument_cutoff(v, x::Float64) = (x > 1.65*v && x > besseljy_large_argument_min(Float64))
26-
besseljy_large_argument_cutoff(v, x::T) where T = (x > 4*v && x > besseljy_large_argument_min(T))
24+
besseljy_large_argument_cutoff(v, x::Float32) = (x > 1.2f0*v && x > besseljy_large_argument_min(x))
25+
besseljy_large_argument_cutoff(v, x::Float64) = (x > 1.65*v && x > besseljy_large_argument_min(x))
26+
#besseljy_large_argument_cutoff(v, x) = (x > 4*v && x > besseljy_large_argument_min(x))
2727

2828
"""
2929
besseljy_large_argument(nu, x::T)
@@ -32,26 +32,26 @@ Asymptotic expansions for large arguments valid when x > 1.6*nu and x > 20.0.
3232
Returns both (besselj(nu, x), bessely(nu, x)).
3333
"""
3434
function besseljy_large_argument(v, x::T) where T
35-
# gives both (besselj, bessely) for x > 1.6*v
3635
α, αp = _α_αp_asymptotic(v, x)
37-
b = SQ2OPI(T) / sqrt(αp * x)
36+
S = promote_type(T, Float64)
37+
v, x = S(v), S(x)
38+
b = SQ2OPI(S) / sqrt(αp * x)
3839

3940
# we need to calculate sin(x - v*pi/2 - pi/4) and cos(x - v*pi/2 - pi/4)
4041
# For improved accuracy this is expanded using the formula for sin(x+y+z)
42+
s, c = sincos(PIO2(S) * v)
43+
sα, cα = sincos(α)
4144

42-
S, C = sincos(PIO2(T) * v)
43-
Sα, Cα = sincos(α)
45+
CMS = c - s
46+
CPS = c + s
4447

45-
CMS = C - S
46-
CPS = C + S
48+
s1 = CMS * cα
49+
s2 = CPS * sα
4750

48-
s1 = CMS * Cα
49-
s2 = CPS * Sα
51+
s3 = CMS * sα
52+
s4 = CPS * cα
5053

51-
s3 = CMS * Sα
52-
s4 = CPS * Cα
53-
54-
return SQ2O2(T) * (s1 + s2) * b, SQ2O2(T) * (s3 - s4) * b
54+
return SQ2O2(S) * (s1 + s2) * b, SQ2O2(S) * (s3 - s4) * b
5555
end
5656

5757
# Float64
@@ -98,7 +98,30 @@ function _α_αp_asymptotic(v, x::Float64)
9898
return _α_αp_poly_30(v, x)
9999
end
100100
end
101-
101+
function _α_αp_asymptotic(v, x::Float32)
102+
v, x = Float64(v), Float64(x)
103+
if x > 4*v
104+
return _α_αp_poly_5(v, x)
105+
elseif x > 1.8*v
106+
return _α_αp_poly_10(v, x)
107+
else
108+
return _α_αp_poly_30(v, x)
109+
end
110+
end
111+
function _α_αp_poly_5(v, x::T) where T
112+
xinv = inv(x)^2
113+
μ = 4 * v^2
114+
s0 = one(T)
115+
s1 = (1 - μ) / 8
116+
s2 = evalpoly(μ, (-0.1953125, 0.203125, -0.0078125))
117+
s3 = evalpoly(μ, (1.0478515625, -1.1591796875, 0.1123046875, -0.0009765625))
118+
s4 = evalpoly(μ, (-11.466461181640625, 13.1358642578125, -1.71624755859375, 0.0469970703125, -0.000152587890625))
119+
s5 = evalpoly(μ, (211.27614974975586, -246.8455924987793, 37.067405700683594, -1.5151596069335938, 0.017223358154296875, -2.6702880859375e-5))
120+
121+
αp = evalpoly(xinv, (s0, s1, s2, s3, s4, s5))
122+
α = x * evalpoly(xinv, (s0, -s1, -s2/3, -s3/5, -s4/7, -s5/9))
123+
return α, αp
124+
end
102125
function _α_αp_poly_10(v, x::T) where T
103126
xinv = inv(x)^2
104127
μ = 4 * v^2

0 commit comments

Comments
 (0)