Skip to content

Commit 58dc296

Browse files
committed
Improve tgamma accuracy by about 4 ULP
1 parent c45aa02 commit 58dc296

File tree

2 files changed

+123
-171
lines changed

2 files changed

+123
-171
lines changed

amd/device-libs/ocml/src/tgammaD.cl

Lines changed: 71 additions & 134 deletions
Original file line numberDiff line numberDiff line change
@@ -10,149 +10,86 @@
1010
CONSTATTR double
1111
MATH_MANGLE(tgamma)(double x)
1212
{
13-
const double pi = 3.14159265358979323846;
14-
15-
1613
double ax = BUILTIN_ABS_F64(x);
1714
double ret;
1815

19-
if (ax > 0x1.0p-11) {
20-
// For x < 4, push to [1-3] range using gamma(x) = gamma(x+1) / x
21-
// For 4.5 < x < 6.5, push above 6.5
22-
// [4,4.5) left alone
23-
double nterm = 1.0;
24-
double dterm = 1.0;
25-
double z = ax;
26-
if (ax < 4.5) {
27-
if (ax < 1.0) {
28-
dterm = z;
29-
z += 1.0;
30-
} else if (ax < 3.0) {
31-
; // do nothing
32-
} else if (ax < 4.0) {
33-
z -= 1.0;
34-
nterm = z;
16+
if (ax < 16.0) {
17+
double n, d;
18+
double y = x;
19+
if (x > 0.0) {
20+
n = 1.0;
21+
while (y > 2.5) {
22+
n = MATH_MAD(n, y, -n);
23+
y = y - 1.0;
24+
n = MATH_MAD(n, y, -n);
25+
y = y - 1.0;
3526
}
36-
} else if (ax < 5.5) {
37-
dterm = MATH_MAD(z,z,z);
38-
z += 2.0;
39-
} else if (ax < 6.5) {
40-
dterm = z;
41-
z += 1.0;
42-
}
43-
44-
double negadj = 1.0;
45-
if (x < 0.0) {
46-
negadj = -x * MATH_MANGLE(sinpi)(x);
47-
}
48-
49-
double etonegz = MATH_MANGLE(exp)(-z);
50-
51-
if (z < 4.5) {
52-
const double rn0 = 297.312130630940277;
53-
const double rn1 = 16926.1409177878806;
54-
const double rn2 = 131675.407800922036;
55-
const double rn3 = 344586.743316038732;
56-
const double rn4 = 440619.954224349898;
57-
const double rn5 = 275507.567385621460;
58-
const double rn6 = 84657.9644812230335;
59-
60-
const double rd0 = 1.00000000000000000;
61-
const double rd1 = -13.3400904528209096;
62-
const double rd2 = 3270.94389286527964;
63-
const double rd3 = 41972.5365974090031;
64-
const double rd4 = 123293.896672792281;
65-
const double rd5 = 166739.899991898533;
66-
const double rd6 = 107097.146935059144;
67-
const double rd7 = 33773.6414083704053;
68-
69-
double num = MATH_MAD(z,
70-
MATH_MAD(z,
71-
MATH_MAD(z,
72-
MATH_MAD(z,
73-
MATH_MAD(z,
74-
MATH_MAD(z, rn6, rn5),
75-
rn4),
76-
rn3),
77-
rn2),
78-
rn1),
79-
rn0) * nterm;
80-
81-
double den = MATH_MAD(z,
82-
MATH_MAD(z,
83-
MATH_MAD(z,
84-
MATH_MAD(z,
85-
MATH_MAD(z,
86-
MATH_MAD(z,
87-
MATH_MAD(z, rd7, rd6),
88-
rd5),
89-
rd4),
90-
rd3),
91-
rd2),
92-
rd1),
93-
rd0) * dterm;
94-
95-
double zpow = MATH_MANGLE(powr)(z, z+0.5);
96-
97-
if (x >= 0.0) {
98-
ret = etonegz * zpow * MATH_DIV(num,den);
99-
} else {
100-
ret = MATH_DIV(den*pi, negadj*etonegz*zpow*num);
101-
ret = BUILTIN_FRACTION_F64(x) == 0.0 ? QNAN_F64 : ret;
27+
if (y > 1.5) {
28+
n = MATH_MAD(n, y, -n);
29+
y = y - 1.0;
10230
}
31+
if (x >= 0.5)
32+
y = y - 1.0;
33+
d = x < 0.5 ? x : 1.0;
10334
} else {
104-
const double c0 = 2.5066282746310007;
105-
const double c1 = 0.20888568955258338;
106-
const double c2 = 0.008703570398024307;
107-
const double c3 = -0.0067210904740298821;
108-
const double c4 = -0.00057520123811017124;
109-
const double c5 = 0.0019652948815832029;
110-
const double c6 = 0.00017478252120455912;
111-
const double c7 = -0.0014843411351582762;
112-
const double c8 = -0.00012963757321125544;
113-
const double c9 = 0.0021043112297532062;
114-
const double c10 = 0.00018059994565555043;
115-
const double c11 = -0.0047987856705463457;
116-
const double c12 = -0.0004073678593815252;
117-
const double c13 = 0.01605085033194459500;
118-
const double c14 = 0.0013539922801590941;
119-
const double c15 = -0.074015421268427375;
120-
const double c16 = -0.0062208086788087787;
121-
const double c17 = 0.45004033385625097;
122-
123-
double rz = MATH_RCP(z);
124-
125-
double poly = MATH_MAD(rz, MATH_MAD(rz, MATH_MAD(rz, MATH_MAD(rz, MATH_MAD(rz,
126-
MATH_MAD(rz, MATH_MAD(rz, MATH_MAD(rz, MATH_MAD(rz, MATH_MAD(rz,
127-
MATH_MAD(rz, MATH_MAD(rz, MATH_MAD(rz, MATH_MAD(rz, MATH_MAD(rz,
128-
MATH_MAD(rz, MATH_MAD(rz,
129-
c17, c16), c15), c14), c13), c12), c11), c10), c9), c8),
130-
c7), c6), c5), c4), c3), c2), c1), c0);
131-
132-
double zpow = MATH_MANGLE(powr)(z, MATH_MAD(0.5, z, -0.25));
133-
if (x >= 0.0) {
134-
ret = MATH_DIV(etonegz*zpow*zpow*poly, dterm);
135-
ret = x > 0x1.573fae561f647p+7 ? PINF_F64 : ret;
136-
} else if (x < 0.0) {
137-
if (x >= -170.5) {
138-
ret = MATH_DIV(pi*dterm, etonegz*zpow*zpow*poly*negadj);
139-
} else if (x >= -184.0) {
140-
ret = MATH_DIV(MATH_DIV(pi*dterm, etonegz*zpow*poly), zpow*negadj);
141-
} else {
142-
ret = BUILTIN_COPYSIGN_F64(0.0, negadj);
143-
}
144-
ret = BUILTIN_FRACTION_F64(x) == 0.0 ? QNAN_F64 : ret;
145-
} else {
146-
ret = x;
35+
d = x;
36+
while (y < -1.5) {
37+
d = MATH_MAD(d, y, d);
38+
y = y + 1.0;
39+
d = MATH_MAD(d, y, d);
40+
y = y + 1.0;
14741
}
42+
if (y < -0.5) {
43+
d = MATH_MAD(d, y, d);
44+
y = y + 1.0;
45+
}
46+
n = 1.0;
14847
}
48+
double qt = MATH_MAD(y, MATH_MAD(y, MATH_MAD(y, MATH_MAD(y,
49+
MATH_MAD(y, MATH_MAD(y, MATH_MAD(y, MATH_MAD(y,
50+
MATH_MAD(y, MATH_MAD(y, MATH_MAD(y, MATH_MAD(y,
51+
MATH_MAD(y,
52+
-0x1.aed75feec7b9ap-23, 0x1.31854a0be3cd3p-20),
53+
-0x1.5037d6a97a8b7p-20), -0x1.51d67f2cdbcfbp-16),
54+
0x1.0c8ab2ac5112dp-13), -0x1.c364ce9b5e149p-13),
55+
-0x1.317113a39f929p-10), 0x1.d919c501178a3p-8),
56+
-0x1.3b4af282da690p-7), -0x1.59af103bf2cd0p-5),
57+
0x1.5512320b432ccp-3), -0x1.5815e8fa28886p-5),
58+
-0x1.4fcf4026afa24p-1), 0x1.2788cfc6fb61cp-1);
59+
60+
ret = MATH_DIV(n, MATH_MAD(d, y*qt, d));
61+
ret = x == 0.0 ? BUILTIN_COPYSIGN_F64(PINF_F64, x) : ret;
62+
ret = x < 0.0 && BUILTIN_FRACTION_F64(x) == 0.0 ? QNAN_F64 : ret;
14963
} else {
150-
const double c0 = -0x1.2788cfc6fb619p-1;
151-
const double c1 = 0x1.fa658c23b1578p-1;
152-
const double c2 = -0x1.d0a118f324b63p-1;
153-
const double c3 = 0x1.f6a51055096b5p-1;
154-
155-
ret = MATH_MAD(x, MATH_MAD(x, MATH_MAD(x, c3, c2), c1), c0) + MATH_RCP(x);
64+
const double sqrt2pi = 0x1.40d931ff62706p+1;
65+
const double sqrtpiby2 = 0x1.40d931ff62706p+0;
66+
67+
double t1 = MATH_MANGLE(powr)(ax, MATH_MAD(ax, 0.5, -0.25));
68+
double t2 = MATH_MANGLE(exp)(-ax);
69+
double xr = MATH_FAST_RCP(ax);
70+
double pt = MATH_MAD(xr, MATH_MAD(xr, MATH_MAD(xr, MATH_MAD(xr,
71+
MATH_MAD(xr, MATH_MAD(xr,
72+
-0x1.2b04c5ea74bbfp-11, 0x1.14869344f1d9bp-14),
73+
0x1.9b3457156ffefp-11), -0x1.e1427e86ee097p-13),
74+
-0x1.5f7266f67c4e0p-9), 0x1.c71c71c0f96adp-9),
75+
0x1.5555555555a28p-4);
76+
77+
if (x > 0.0) {
78+
double gt = sqrt2pi*t2*t1*t1;
79+
double g = MATH_MAD(gt, xr*pt, gt);
80+
ret = x > 0x1.573fae561f646p+7 ? PINF_F64 : g;
81+
} else {
82+
double s = -x * MATH_MANGLE(sinpi)(x);
83+
if (x > -170.5) {
84+
double d = s*t2*t1*t1;
85+
ret = MATH_DIV(sqrtpiby2, MATH_MAD(d, xr*pt, d));
86+
} else if (x > -184.0) {
87+
double d = t2*t1;
88+
ret = MATH_DIV(MATH_DIV(sqrtpiby2, MATH_MAD(d, xr*pt, d)), s*t1);
89+
} else
90+
ret = BUILTIN_COPYSIGN_F64(0.0, s);
91+
ret = BUILTIN_FRACTION_F64(x) == 0.0 || BUILTIN_ISNAN_F64(x) ? QNAN_F64 : ret;
92+
}
15693
}
15794

15895
return ret;

amd/device-libs/ocml/src/tgammaF.cl

Lines changed: 52 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -10,54 +10,69 @@
1010
CONSTATTR float
1111
MATH_MANGLE(tgamma)(float x)
1212
{
13-
const float pi = 0x1.921fb6p+1f;
14-
const float sqrt2pi = 0x1.40d932p+1f;
15-
const float sqrtpiby2 = 0x1.40d932p+0f;
1613
float ax = BUILTIN_ABS_F32(x);
1714
float ret;
1815

19-
if (ax > 0x1.0p-6f) {
20-
// For x < 3, push to larger value using gamma(x) = gamma(x+1) / x
21-
float d = 1.0f;
22-
if (ax < 1.0f) {
23-
d = MATH_MAD((ax + 3.0f), ax, 2.0f) * ax;
24-
ax = ax + 3.0f;
25-
} else if (ax < 2.0f) {
26-
d = MATH_MAD(ax, ax, ax);
27-
ax = ax + 2.0f;
28-
} else if (ax < 3.0f) {
29-
d = ax;
30-
ax = ax + 1.0f;
16+
if (ax < 16.0f) {
17+
float n, d;
18+
float y = x;
19+
if (x > 0.0f) {
20+
n = 1.0f;
21+
while (y > 2.5f) {
22+
n = MATH_MAD(n, y, -n);
23+
y = y - 1.0f;
24+
n = MATH_MAD(n, y, -n);
25+
y = y - 1.0f;
26+
}
27+
if (y > 1.5f) {
28+
n = MATH_MAD(n, y, -n);
29+
y = y - 1.0f;
30+
}
31+
if (x >= 0.5f)
32+
y = y - 1.0f;
33+
d = x < 0.5f ? x : 1.0f;
34+
} else {
35+
d = x;
36+
while (y < -1.5f) {
37+
d = MATH_MAD(d, y, d);
38+
y = y + 1.0f;
39+
d = MATH_MAD(d, y, d);
40+
y = y + 1.0f;
41+
}
42+
if (y < -0.5f) {
43+
d = MATH_MAD(d, y, d);
44+
y = y + 1.0f;
45+
}
46+
n = 1.0f;
3147
}
48+
float qt = MATH_MAD(y, MATH_MAD(y, MATH_MAD(y, MATH_MAD(y,
49+
MATH_MAD(y, MATH_MAD(y,
50+
0x1.d5a56ep-8f, -0x1.4dcb00p-7f), -0x1.59c03ap-5f), 0x1.55405ap-3f),
51+
-0x1.5810f2p-5f), -0x1.4fcfd6p-1f), 0x1.2788ccp-1f);
52+
ret = MATH_DIV(n, MATH_MAD(d, y*qt, d));
53+
ret = x == 0.0f ? BUILTIN_COPYSIGN_F32(PINF_F32, x) : ret;
54+
ret = x < 0.0f && BUILTIN_FRACTION_F32(x) == 0.0f ? QNAN_F32 : ret;
55+
} else {
56+
const float sqrt2pi = 0x1.40d932p+1f;
57+
const float sqrtpiby2 = 0x1.40d932p+0f;
3258

33-
// x^x e^-x (1 + poly(1/x)) sqrt(twopi / x) / d
34-
// Split x^x into a product since it overflows faster than gamma(x)
3559
float t1 = MATH_MANGLE(powr)(ax, MATH_MAD(ax, 0.5f, -0.25f));
3660
float t2 = MATH_MANGLE(exp)(-ax);
3761
float xr = MATH_FAST_RCP(ax);
38-
float pt = xr*MATH_MAD(xr, MATH_MAD(xr, -139.0f/51840.0f, 1.0f/288.0f) , 1.0f/12.0f);
62+
float p = MATH_MAD(xr, MATH_MAD(xr, 0x1.96d7e4p-9f, 0x1.556652p-4f), 0x1.fffff8p-1f);
3963
if (x > 0.0f) {
40-
float p = sqrt2pi*t2*t1*t1 * MATH_FAST_RCP(d);
41-
ret = MATH_MAD(p, pt, p);
42-
ret = x > 0x1.18521ep+5f ? PINF_F32 : ret;
64+
float g = sqrt2pi*t2*t1*t1*p;
65+
ret = x > 0x1.18521ep+5f ? PINF_F32 : g;
4366
} else {
44-
float s = MATH_MANGLE(sinpi)(x);
45-
if (x > -30.0f) {
46-
float p = s*x*t2*t1*t1;
47-
ret = MATH_DIV(-sqrtpiby2*d, MATH_MAD(p, pt, p));
48-
} else if (x > -41.0f) {
49-
float t3 = t2*t1;
50-
float p1 = MATH_MAD(t3, pt, t3);
51-
float p2 = s*x*t1;
52-
ret = MATH_DIV(MATH_DIV(-sqrtpiby2*d, p1), p2);
53-
} else
54-
ret = 0.0f;
55-
ret = BUILTIN_FRACTION_F32(x) == 0.0f ? QNAN_F32 : ret;
67+
float s = -x * MATH_MANGLE(sinpi)(x);
68+
if (x > -30.0f)
69+
ret = MATH_DIV(sqrtpiby2, s*t2*t1*t1*p);
70+
else if (x > -41.0f)
71+
ret = MATH_DIV(MATH_DIV(sqrtpiby2, t2*t1*p), s*t1);
72+
else
73+
ret = BUILTIN_COPYSIGN_F32(0.0f, s);
74+
ret = BUILTIN_FRACTION_F32(x) == 0.0f || BUILTIN_ISNAN_F32(x) ? QNAN_F32 : ret;
5675
}
57-
} else {
58-
ret = MATH_MAD(x, MATH_MAD(x, MATH_MAD(x,
59-
0x1.f6a510p-1f, -0x1.d0a118p-1f), 0x1.fa658cp-1f), -0x1.2788d0p-1f) +
60-
4.0f*MATH_FAST_RCP(4.0f*x);
6176
}
6277

6378
return ret;

0 commit comments

Comments
 (0)