55random forest is small.
66
77"""
8- import functools
9- import itertools
8+ import warnings
109import numpy as np
1110from scipy .optimize import minimize
1211from scipy .signal import fftconvolve
3231 path = 'forestci' )
3332
3433
35- def gfit (X , sigma , p = 5 , nbin = 200 , unif_fraction = 0.1 ):
34+ def gfit (X , sigma , p = 2 , nbin = 1000 , unif_fraction = 0.1 ):
3635 """
3736 Fit empirical Bayes prior in the hierarchical model [Efron2014]_.
3837
@@ -47,62 +46,66 @@ def gfit(X, sigma, p=5, nbin=200, unif_fraction=0.1):
4746 sigma: float
4847 Noise estimate on X.
4948 p: int
50- Number of parameters used to fit G. Default: 5
49+ Number of parameters used to fit G.
5150 nbin: int
5251 Number of bins used for discrete approximation.
53- Default: 200
5452 unif_fraction: float
55- Fraction of G modeled as "slab". Default: 0.1
53+ Fraction of G modeled as "slab".
5654
5755 Returns
5856 -------
5957 A tuple (xvals, g_eta): the x-positions of the bins and the density estimate g at those positions.
6058 """
61- min_x = min (min (X ) - 2 * np .std (X , ddof = 1 ), 0 )
59+ min_x = max (min (X ) - 2 * np .std (X , ddof = 1 ), 0 )
6260 max_x = max (max (X ) + 2 * np .std (X , ddof = 1 ),
6361 np .std (X , ddof = 1 ))
6462 xvals = np .linspace (min_x , max_x , nbin )
65- binw = (max_x - min_x ) / (nbin - 1 )
6663
67- zero_idx = max ( np . where ( xvals <= 0 )[ 0 ] )
68- noise_kernel = norm (). pdf ( xvals / sigma ) * binw / sigma
64+ noise_kernel = norm ( scale = sigma , loc = xvals . mean ()). pdf ( xvals )
65+ noise_kernel /= noise_kernel . sum ()
6966
70- if zero_idx > 0 :
71- noise_rotate = noise_kernel [list (np .arange (zero_idx , len (xvals ))) +
72- list (np .arange (0 , zero_idx ))]
73- else :
74- noise_rotate = noise_kernel
67+ mask = xvals > 0
68+ assert sum (mask ) > 0
69+ g_eta_slab = mask / sum (mask )
7570
76- XX = np .zeros ((p , len (xvals )), dtype = "float" )
77- for ind , exp in enumerate (range (1 , p + 1 )):
78- mask = np .ones_like (xvals )
79- mask [np .where (xvals <= 0 )[0 ]] = 0
80- XX [ind , :] = pow (xvals , exp ) * mask
81- XX = XX .T
71+ XX = np .column_stack ([ pow (xvals , exp ) for exp in range (1 , p + 1 )])
72+ XX /= np .sum (XX ,axis = 0 , keepdims = True ) # normalize each feature column for better numerical stability
8273
8374 def neg_loglik (eta ):
84- mask = np .ones_like (xvals )
85- mask [np .where (xvals <= 0 )[0 ]] = 0
86- g_eta_raw = np .exp (np .dot (XX , eta )) * mask
75+ with np .errstate (over = 'ignore' ):
76+ # If eta > 0, the exponential is likely to overflow. That is fine: a non-finite sum is caught by the penalty branch below.
77+ g_eta_raw = np .exp (np .dot (XX , eta )) * mask
78+
8779 if ((np .sum (g_eta_raw ) == np .inf ) |
8880 (np .sum (g_eta_raw ) <=
8981 100 * np .finfo (np .double ).tiny )):
9082 return (1000 * (len (X ) + sum (eta ** 2 )))
9183
84+ assert sum (g_eta_raw ) > 0 , "Unexpected error"
85+ assert np .isfinite (sum (g_eta_raw )), "Unexpected error"
9286 g_eta_main = g_eta_raw / sum (g_eta_raw )
93- g_eta = ((1 - unif_fraction ) * g_eta_main +
94- unif_fraction * mask / sum (mask ))
95- f_eta = fftconvolve (g_eta , noise_rotate , mode = 'same' )
87+ g_eta = (
88+ (1 - unif_fraction ) * g_eta_main +
89+ unif_fraction * g_eta_slab )
90+ f_eta = fftconvolve (g_eta , noise_kernel , mode = 'same' )
9691 return np .sum (np .interp (X , xvals ,
9792 - np .log (np .maximum (f_eta , 0.0000001 ))))
9893
99- eta_hat = minimize (neg_loglik ,
100- list (itertools .repeat (- 1 , p ))).x
94+ res = minimize (
95+ neg_loglik ,
96+ np .full (p , - 1 , dtype = 'float' ),
97+ tol = 5e-5 # adjusted so that the MPG example in the docs passes
98+ )
99+ if not res .success :
100+ warnings .warn ("Fitting the empirical bayes prior failed with message %s." % res .message )
101+ eta_hat = res .x
101102 g_eta_raw = np .exp (np .dot (XX , eta_hat )) * mask
102103 g_eta_main = g_eta_raw / sum (g_eta_raw )
103- g_eta = ((1 - unif_fraction ) * g_eta_main +
104- unif_fraction * mask ) / sum (mask )
104+ g_eta = (
105+ (1 - unif_fraction ) * g_eta_main +
106+ unif_fraction * g_eta_slab )
105107
108+ assert np .all (np .isfinite (g_eta )), "Fitting the empirical bayes prior failed."
106109 return xvals , g_eta
107110
108111
@@ -114,8 +117,10 @@ def gbayes(x0, g_est, sigma):
114117 ----------
115118 x0: ndarray
116119 an observation
117- g_est: float
120+ g_est: (ndarray,ndarray)
118121 a prior density, as returned by gfit
122+ g_est[0] is the x-positions
123+ g_est[1] is the densities
119124 sigma: float
120125 noise estimate
121126
@@ -147,18 +152,17 @@ def calibrateEB(variances, sigma2):
147152 """
148153 if (sigma2 <= 0 or min (variances ) == max (variances )):
149154 return (np .maximum (variances , 0 ))
155+
150156 sigma = np .sqrt (sigma2 )
151157 eb_prior = gfit (variances , sigma )
152- # Set up a partial execution of the function
153- part = functools .partial (gbayes , g_est = eb_prior ,
154- sigma = sigma )
158+
155159 if len (variances ) >= 200 :
156160 # Interpolate to speed up computations:
157161 calib_x = np .percentile (variances ,
158162 np .arange (0 , 102 , 2 ))
159- calib_y = list ( map ( part , calib_x ))
163+ calib_y = [ gbayes ( x , g_est = eb_prior , sigma = sigma ) for x in calib_x ]
160164 calib_all = np .interp (variances , calib_x , calib_y )
161165 else :
162- calib_all = list ( map ( part , variances ))
166+ calib_all = [ gbayes ( x , g_est = eb_prior , sigma = sigma ) for x in variances ]
163167
164168 return np .asarray (calib_all )
0 commit comments