
Commit 0dc01e2

Merge pull request #114 from el-hult/nanhunt
Improve calibration
2 parents: ffa7227 + 6ae1d9e

File tree: 2 files changed (+44, −37 lines)


.gitignore

Lines changed: 3 additions & 0 deletions
@@ -74,3 +74,6 @@ paper/*.txt
 
 # Jupiter notebooks
 .ipynb_checkpoints
+
+# pyenv environment
+.python-version

forestci/calibration.py

Lines changed: 41 additions & 37 deletions
@@ -5,8 +5,7 @@
 random forest is small.
 
 """
-import functools
-import itertools
+import warnings
 import numpy as np
 from scipy.optimize import minimize
 from scipy.signal import fftconvolve
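The unused functools and itertools imports are dropped (their only call sites are rewritten below), and warnings comes in for gfit's new non-convergence report. A minimal sketch of how a caller might escalate that warning into a hard error, say in a test suite; the filter setup is illustrative, not part of this commit, and variances/sigma are assumed inputs:

    import warnings

    # Illustrative only: promote the non-convergence warning to an exception.
    with warnings.catch_warnings():
        warnings.simplefilter("error")         # any warnings.warn(...) now raises
        xvals, g_eta = gfit(variances, sigma)  # raises if the optimizer reports failure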
@@ -32,7 +31,7 @@
          path='forestci')
 
 
-def gfit(X, sigma, p=5, nbin=200, unif_fraction=0.1):
+def gfit(X, sigma, p=2, nbin=1000, unif_fraction=0.1):
     """
     Fit empirical Bayes prior in the hierarchical model [Efron2014]_.
 
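The new defaults trade the old degree-5 polynomial fit on 200 bins for a degree-2 fit on a finer 1000-point grid. A rough sketch of what nbin controls, reusing the grid construction from the next hunk; the data values are toy assumptions:

    import numpy as np

    X = np.array([0.5, 0.8, 1.2, 2.0])        # toy variance estimates
    sd = np.std(X, ddof=1)
    min_x = max(min(X) - 2 * sd, 0)           # the grid is now clipped at zero
    max_x = max(max(X) + 2 * sd, sd)
    xvals = np.linspace(min_x, max_x, 1000)   # nbin=1000 support points for g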
@@ -47,62 +46,66 @@ def gfit(X, sigma, p=5, nbin=200, unif_fraction=0.1):
     sigma: float
         Noise estimate on X.
     p: int
-        Number of parameters used to fit G. Default: 5
+        Number of parameters used to fit G.
     nbin: int
         Number of bins used for discrete approximation.
-        Default: 200
     unif_fraction: float
-        Fraction of G modeled as "slab". Default: 0.1
+        Fraction of G modeled as "slab".
 
     Returns
     -------
     An array of the posterior density estimate g.
     """
-    min_x = min(min(X) - 2 * np.std(X, ddof=1), 0)
+    min_x = max(min(X) - 2 * np.std(X, ddof=1), 0)
     max_x = max(max(X) + 2 * np.std(X, ddof=1),
                 np.std(X, ddof=1))
     xvals = np.linspace(min_x, max_x, nbin)
-    binw = (max_x - min_x) / (nbin - 1)
 
-    zero_idx = max(np.where(xvals <= 0)[0])
-    noise_kernel = norm().pdf(xvals / sigma) * binw / sigma
+    noise_kernel = norm(scale=sigma, loc=xvals.mean()).pdf(xvals)
+    noise_kernel /= noise_kernel.sum()
 
-    if zero_idx > 0:
-        noise_rotate = noise_kernel[list(np.arange(zero_idx, len(xvals))) +
-                                    list(np.arange(0, zero_idx))]
-    else:
-        noise_rotate = noise_kernel
+    mask = xvals > 0
+    assert sum(mask) > 0
+    g_eta_slab = mask / sum(mask)
 
-    XX = np.zeros((p, len(xvals)), dtype="float")
-    for ind, exp in enumerate(range(1, p+1)):
-        mask = np.ones_like(xvals)
-        mask[np.where(xvals <= 0)[0]] = 0
-        XX[ind, :] = pow(xvals, exp) * mask
-    XX = XX.T
+    XX = np.column_stack([pow(xvals, exp) for exp in range(1, p+1)])
+    XX /= np.sum(XX, axis=0, keepdims=True)  # normalize each feature column for better numerical stability
 
     def neg_loglik(eta):
-        mask = np.ones_like(xvals)
-        mask[np.where(xvals <= 0)[0]] = 0
-        g_eta_raw = np.exp(np.dot(XX, eta)) * mask
+        with np.errstate(over='ignore'):
+            # if eta > 0 the exponential will likely overflow; that is fine.
+            g_eta_raw = np.exp(np.dot(XX, eta)) * mask
+
         if ((np.sum(g_eta_raw) == np.inf) |
                 (np.sum(g_eta_raw) <=
                  100 * np.finfo(np.double).tiny)):
             return (1000 * (len(X) + sum(eta ** 2)))
 
+        assert sum(g_eta_raw) > 0, "Unexpected error"
+        assert np.isfinite(sum(g_eta_raw)), "Unexpected error"
         g_eta_main = g_eta_raw / sum(g_eta_raw)
-        g_eta = ((1 - unif_fraction) * g_eta_main +
-                 unif_fraction * mask / sum(mask))
-        f_eta = fftconvolve(g_eta, noise_rotate, mode='same')
+        g_eta = (
+            (1 - unif_fraction) * g_eta_main +
+            unif_fraction * g_eta_slab)
+        f_eta = fftconvolve(g_eta, noise_kernel, mode='same')
         return np.sum(np.interp(X, xvals,
                                 -np.log(np.maximum(f_eta, 0.0000001))))
 
-    eta_hat = minimize(neg_loglik,
-                       list(itertools.repeat(-1, p))).x
+    res = minimize(
+        neg_loglik,
+        np.full(p, -1, dtype='float'),
+        tol=5e-5  # adjusted so that the MPG example in the docs passes
+    )
+    if not res.success:
+        warnings.warn("Fitting the empirical Bayes prior failed with message %s." % res.message)
+    eta_hat = res.x
     g_eta_raw = np.exp(np.dot(XX, eta_hat)) * mask
     g_eta_main = g_eta_raw / sum(g_eta_raw)
-    g_eta = ((1 - unif_fraction) * g_eta_main +
-             unif_fraction * mask) / sum(mask)
+    g_eta = (
+        (1 - unif_fraction) * g_eta_main +
+        unif_fraction * g_eta_slab)
 
+    assert np.all(np.isfinite(g_eta)), "Fitting the empirical Bayes prior failed."
     return xvals, g_eta
 
 
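The core of the change: the noise kernel becomes an explicitly normalized Gaussian centered on the grid (replacing the rotated-kernel construction), the "slab" is precomputed once as a uniform density over the positive grid points, and the optimizer result is checked rather than silently unpacked. A self-contained sketch of the new density construction, with toy values for the grid and sigma:

    import numpy as np
    from scipy.stats import norm
    from scipy.signal import fftconvolve

    xvals = np.linspace(0.0, 4.0, 1000)   # discretization grid (toy values)
    sigma = 0.3                           # toy noise estimate
    p, unif_fraction = 2, 0.1             # the new defaults

    # Normalized Gaussian noise kernel centered on the grid midpoint:
    noise_kernel = norm(scale=sigma, loc=xvals.mean()).pdf(xvals)
    noise_kernel /= noise_kernel.sum()

    mask = xvals > 0                      # support of the prior
    g_eta_slab = mask / mask.sum()        # uniform "slab" component

    # Exponential-family "spike" component for one candidate eta:
    XX = np.column_stack([pow(xvals, e) for e in range(1, p + 1)])
    XX /= XX.sum(axis=0, keepdims=True)   # column-normalized, as in the diff
    eta = np.full(p, -1.0)                # the optimizer's starting point
    g_eta_raw = np.exp(XX @ eta) * mask
    g_eta_main = g_eta_raw / g_eta_raw.sum()

    g_eta = (1 - unif_fraction) * g_eta_main + unif_fraction * g_eta_slab
    f_eta = fftconvolve(g_eta, noise_kernel, mode='same')  # marginal density of X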
@@ -114,8 +117,10 @@ def gbayes(x0, g_est, sigma):
     ----------
     x0: ndarray
         an observation
-    g_est: float
+    g_est: (ndarray, ndarray)
         a prior density, as returned by gfit
+        g_est[0] is the x-positions
+        g_est[1] is the densities
     sigma: int
         noise estimate
 
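The docstring correction matters for callers: gfit returns the pair (xvals, g_eta), and that whole tuple is what gbayes expects as g_est. A hedged usage sketch on toy inputs, assuming the module layout shown above:

    import numpy as np
    from forestci.calibration import gfit, gbayes

    variances = np.array([0.8, 0.9, 1.1, 1.4])  # toy variance estimates
    sigma = 0.2                                 # toy noise estimate
    eb_prior = gfit(variances, sigma)           # the (x-positions, densities) pair
    calibrated_first = gbayes(variances[0], g_est=eb_prior, sigma=sigma)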
@@ -147,18 +152,17 @@ def calibrateEB(variances, sigma2):
     """
     if (sigma2 <= 0 or min(variances) == max(variances)):
         return(np.maximum(variances, 0))
+
     sigma = np.sqrt(sigma2)
     eb_prior = gfit(variances, sigma)
-    # Set up a partial execution of the function
-    part = functools.partial(gbayes, g_est=eb_prior,
-                             sigma=sigma)
+
     if len(variances) >= 200:
         # Interpolate to speed up computations:
         calib_x = np.percentile(variances,
                                 np.arange(0, 102, 2))
-        calib_y = list(map(part, calib_x))
+        calib_y = [gbayes(x, g_est=eb_prior, sigma=sigma) for x in calib_x]
         calib_all = np.interp(variances, calib_x, calib_y)
     else:
-        calib_all = list(map(part, variances))
+        calib_all = [gbayes(x, g_est=eb_prior, sigma=sigma) for x in variances]
 
     return np.asarray(calib_all)
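With functools.partial gone, the comprehensions call gbayes directly; the >= 200 branch still evaluates gbayes at only 51 percentile points (np.arange(0, 102, 2)) and interpolates the rest. A usage sketch of the public entry point on synthetic data; the distribution and values are illustrative assumptions:

    import numpy as np
    from forestci.calibration import calibrateEB

    rng = np.random.default_rng(0)
    variances = rng.gamma(shape=2.0, scale=0.5, size=300)  # >= 200 entries, so
    sigma2 = 0.25                                          # the interpolation path runs
    calibrated = calibrateEB(variances, sigma2)
    print(calibrated.shape)                                # (300,)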
