Skip to content
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 21 additions & 67 deletions src/erf.jl
Original file line number Diff line number Diff line change
Expand Up @@ -111,12 +111,6 @@ function _erf(x::Float64)
ix::UInt32=reinterpret(UInt64,x)>>32
# # top 32, without sign bit
ia::UInt32=ix & 0x7fffffff
# # sign
# sign::UInt32=ix>>31

sign::Bool=x<0



if (ia < 0x3feb0000)
# a = |x| < 0.84375.
Expand All @@ -130,7 +124,6 @@ function _erf(x::Float64)
PA=(0.12837916709551256, -0.3761263890318287, 0.11283791670896592, -0.026866170630075903, 0.005223977428649761, -0.0008548312229989974, 0.00012054654502151287, -1.4906315067498891e-5, 1.6126444349070824e-6, -1.3074259056679966e-7)

r= fma(x,evalpoly(x2,PA),x) ## This fma is crucial for accuracy.

return r
else
## 0.5 <= a < 0.84375 - Use rational approximation.
Expand Down Expand Up @@ -158,12 +151,9 @@ function _erf(x::Float64)
P=evalpoly(a,NB)
Q=evalpoly(a,DB)

r= C + P / Q
return copysign(r,x)

if (sign)
return -C - P / Q
else
return C + P / Q
end
elseif (ia < 0x40000000)
## 1.25 <= |x| < 2.0.
a = abs(x)
Expand All @@ -174,13 +164,9 @@ function _erf(x::Float64)
PC=( 0x1.3bcd133aa0ffcp-4, -0x1.e4652fadcb702p-3, 0x1.2ebf3dcca0446p-2, -0x1.571d01c62d66p-3, 0x1.93a9a8f5b3413p-8, 0x1.8281cbcc2cd52p-5, -0x1.5cffd86b4de16p-6, -0x1.db4ccf595053ep-9, 0x1.757cbf8684edap-8, -0x1.ce7dfd2a9e56ap-11, -0x1.99ee3bc5a3263p-11, 0x1.3c57cf9213f5fp-12, 0x1.60692996bf254p-14, -0x1.6e44cb7c1fa2ap-14, 0x1.9d4484ac482b2p-16, -0x1.578c9e375d37p-19)

# Obtains erfc of |x|
r=evalpoly(a,PC)
r=1.0-evalpoly(a,PC)
return copysign(r,x)

if (sign)
return -1.0 + r
else
return 1.0 - r
end
elseif (ia < 0x400a0000)
## 2 <= |x| < 3.25.
a = abs(x)
Expand All @@ -189,16 +175,10 @@ function _erf(x::Float64)
# Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=17 a=2 b=3.25 c=2 d=2
PD=( 0x1.328f5ec350e5p-8, -0x1.529b9e8cf8e99p-5, 0x1.529b9e8cd9e71p-3, -0x1.8b0ae3a023bf2p-2, 0x1.1a2c592599d82p-1, -0x1.ace732477e494p-2, -0x1.e1a06a27920ffp-6, 0x1.bae92a6d27af6p-2, -0x1.a15470fcf5ce7p-2, 0x1.bafe45d18e213p-6, 0x1.0d950680d199ap-2, -0x1.8c9481e8f22e3p-3, -0x1.158450ed5c899p-4, 0x1.c01f2973b44p-3, -0x1.73ed2827546a7p-3, 0x1.47733687d1ff7p-4, -0x1.2dec70d00b8e1p-6, 0x1.a947ab83cd4fp-10 )


# Obtains erfc of |x|
r=evalpoly(a,PD)
r=1.0-evalpoly(a,PD)
return copysign(r,x)


if (sign)
return -1.0 + r
else
return 1.0 - r
end
elseif (ia < 0x40100000)
## 3.25 <= |x| < 4.0.
a = abs(x)
Expand All @@ -207,14 +187,9 @@ function _erf(x::Float64)
# Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=13 a=3.25 b=4 c=1 d=3.25
PE=( 0x1.20c13035539e4p-18, -0x1.e9b5e8d16df7ep-16, 0x1.8de3cd4733bf9p-14, -0x1.9aa48beb8382fp-13, 0x1.2c7d713370a9fp-12, -0x1.490b12110b9e2p-12, 0x1.1459c5d989d23p-12, -0x1.64b28e9f1269p-13, 0x1.57c76d9d05cf8p-14, -0x1.bf271d9951cf8p-16, 0x1.db7ea4d4535c9p-19, 0x1.91c2e102d5e49p-20, -0x1.e9f0826c2149ep-21, 0x1.60eebaea236e1p-23 )

r=evalpoly(a,PE)

r=1.0-evalpoly(a,PE)
return copysign(r,x)

if (sign)
return -1.0 + r
else
return 1.0 - r
end
elseif (ia < 0x4017a000)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

any chance you can regenerate the polies to make this be 0x40180000? If you can, you would be able to use UInt16 literals for all of these by only taking the top 16 rather than top 32 bits.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

would that have a meaningful impact?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

probably not much. It might just save a cycle or 2.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if you wanted to test the speed, you could try it without regenerating the polys and it should give you a good idea.

## 4 <= |x| < 5.90625.
a = abs(x)
Expand All @@ -223,31 +198,19 @@ function _erf(x::Float64)
# Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=16 a=4 b=5.90625 c=2 d=4
PF=( 0x1.08ddd130d1fa6p-26, -0x1.10b146f59ff06p-22, 0x1.10b135328b7b2p-19, -0x1.6039988e7575fp-17, 0x1.497d365e19367p-15, -0x1.da48d9afac83ep-14, 0x1.1024c9b1fbb48p-12, -0x1.fc962e7066272p-12, 0x1.87297282d4651p-11, -0x1.f057b255f8c59p-11, 0x1.0228d0eee063p-10, -0x1.b1b21b84ec41cp-11, 0x1.1ead8ae9e1253p-11, -0x1.1e708fba37fccp-12, 0x1.9559363991edap-14, -0x1.68c827b783d9cp-16, 0x1.2ec4adeccf4a2p-19 )

r=evalpoly(a,PF)


if (sign)
return -1.0 + r
else
return 1.0 - r
end
r=1.0-evalpoly(a,PF)
return copysign(r,x)
else

if(isnan(x))
return NaN
end

if (sign)
return -1.0
else
return 1.0
end

copysign(1.0,x)
end


end

# Fast erf implementation using
# polynomial approximations of erf and erfc.
# Highest measured error is 1.12 ULPs at x = 1.232469
function _erf(x::Float32)
xabs=abs(x)

Expand All @@ -256,47 +219,38 @@ function _erf(x::Float32)
# # erf approximation using erf(x)=x+x*P(x^2) with degree 6
# Sollya::remez(erf(sqrt(x))/sqrt(x)-1,6,[1e-32;0.5],1,1e-32);

p=(0.128379167084072442015971708878153077794812875442854468252262977759538466466026108f0 ,-0.37612638677173360887708232154133809982218085424044576090554860522057182310851056f0 ,0.112837843774249243616280397250157777250507394914770984188915122197580134574091983f0 ,-2.68652865375831097248410803484628824158061934615600565461807890879374835870381413f-2 ,5.2188560068141289500478726517878958032487316358404909126782459380536157982980317f-3 ,-8.3948481565340715162646401327604259604058983841328238470236141987304564416187075f-4 )
p=(0.12837917f0, -0.37612638f0, 0.11283784f0, -0.026865287f0, 0.005218856f0, -0.0008394848f0)
return copysign(fma(evalpoly(x^2,p),x,x),x)


elseif(xabs<1.25)
# range [0.5;1.25]
# # erfc approximation with degree 11
# Sollya::remez(erfc(x+0.5),11,[0;1.25-0.5],1,1e-32);

p=(0.47950012218627633266759004480928346323282597139306026470914603497045110845486174f0 ,-0.87878257867905935820797978014037919344903083907274027447236660730320123742574287f0 ,0.43939127340837007019212165648359529028950189241566682841945406944095273511955846f0 ,0.14646415648788187125149469711213532491861248937046075996222718522704324115126361f0 ,-0.18308466657525286729680773848609379638263457954975049708494183866284386220432776f0 ,-7.2864220919698610613876940798326935208926368515022370715399462640511442447273458f-3 ,4.9870470155967931820488830916894059744198728056328087545059900897748278238141407f-2 ,-4.8868244174386309272067176761136942655656143898910274020685204636212063119576779f-3 ,-1.1067663329874328692394302924580932358903686528252386541750628802256971128485086f-2 ,3.422346846104011114882331840937679141622349918363223030054135870224315763897819f-3 ,7.3027064271433070734900533407935697302293380332736344236451943042328241926267078f-4 ,-3.75817085020390291674591289563810393435170791862443379088867595819930602228183736f-4)
p=(0.47950011f0, -0.8787826f0, 0.4393913f0, 0.14646415f0, -0.18308467f0, -0.007286422f0, 0.04987047f0, -0.0048868246f0, -0.011067663f0, 0.003422347f0, 0.00073027064f0, -0.0003758171f0)
return copysign(1f0-evalpoly(xabs-0.5f0,p),x)


elseif(xabs<2.5)
# range [1.25;2.5]
# erfc approximation with degree 13
# Sollya::remez(erfc(x+1.25),13,[1.25-1.25;2.5-1.25],1,1e-32);

p=(7.7099871743416328126356260566547494286908999516428835479787235777958146662395595f-2 ,-0.236521122401763914017434484798130978411662107330375092318490618856395191655478216f0,0.29565140031735793699573224918347147906505950384598003349376150036910634397925794f0 ,-0.1675357298956664512006671881841788827839151996664942041368823179421003600967294f0 ,6.1585929514334837247453472606059635765901344910897544510201614098797139034234648f-3 ,4.7187118101037459805673209483303989110029907488612006887134009408435903419517292f-2 ,-2.13310233800625824496628789189780598311678632855392393834221656949343403265787497f-2 ,-3.5262544206616132931628291419365561301271536625777609043061691511329815920675659f-3 ,5.4618311186313877025711961981462734764321122794521259327727086934478142942850845f-3 ,-4.785867288141399400801105017099155449896231371492841611827465311139149075249888f-4 ,-1.27638529129849191843323454760991411930129610353375308047589690520960413915824394f-3 ,7.3386942792237055752542546076918383420248583637581325208367761430501498054660812f-4 ,-1.78316578653402766600700240536421560298308014794597944402973993319799701405750455f-4 ,1.7451623823305231302742688272669315889569143556707803172195238203096188492888552f-5)
p=(0.077099875f0, -0.23652112f0, 0.2956514f0, -0.16753574f0, 0.006158593f0, 0.04718712f0, -0.021331023f0, -0.0035262543f0, 0.005461831f0, -0.00047858673f0, -0.0012763853f0, 0.00073386944f0, -0.00017831658f0, 1.7451624f-5)
return copysign(1f0-evalpoly(xabs-1.25f0,p),x)


elseif (xabs<4.0)
# range [2.5;4.0]
# erfc approximation with degree 13
# Sollya::remez(erfc(x+2.5),13,[0;4-2.5],1,1e-32);

p=(4.0695201730499156824979069048504659592517281936379802348048709133497117253173134f-4 ,-2.17828418992563158453260234674605666557610132217183615223667886784864793391972216f-3 ,5.4457086379161451966966115384340866958878328106709954104596733367011474611248579f-3 ,-8.3500528577413591092464067696190605106058226371648352228707616455735161644530442f-3 ,8.6220108431254600006776817786647116978577973677436276371791579294890933467761521f-3 ,-6.1151670580428489239821330412642128179703517796977571695036324539244245861906337f-3 ,2.7899458310817905157828387844634029030573127219171061924068857965376725014125688f-3 ,-5.1939500417227631511066424033635849427151215499427143347485235667556121428433924f-4 ,-3.0461048275173680879818513198875622675191891190039217228437020935156706025853604f-4 ,3.1068457426894474803882479013975662189036326925351567029013595934579911165234626f-4 ,-1.38668990773606991790025193904557899401436779771288081224199889420050208965460715f-4 ,3.6909690646602578301139502069782980693656088359691610474313757700491321631191208f-5 ,-5.6828887569336940854242289035551153938157998442490411471025066283558049904037765f-6 ,3.9297630357752347428080345871297027499036518175420666529979940028638158683656671f-7)
p=(0.00040695202f0, -0.002178284f0, 0.0054457085f0, -0.008350053f0, 0.008622011f0, -0.006115167f0, 0.0027899458f0, -0.000519395f0, -0.00030461047f0, 0.00031068458f0, -0.00013866898f0, 3.6909692f-5, -5.682889f-6, 3.929763f-7)
return copysign(1f0-evalpoly(xabs-2.5f0,p),x)


else
# |x| >= 4.0.


r=copysign(1f0,x)

return r

# range [4.0,inf)
r=copysign(1f0,x)
return r
end

end

_erf(x::Float16)=Float16(_erf(Float32(x)))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if you wanted to do a Float16 impl, it should be easier than the others. Specifically, the domain is only to 2, and the accuracy required is much reduced.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

100% could wait for a followup PR.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm thinking that too to be honest.
this and the poli regen.

Expand Down
Loading