# lle.py
from rnd import pdf , cdf , rf
from scipy.optimize import minimize
import numpy as np
import warnings
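# toy dataset: cdf and rf come from the companion rnd module of this
# repository. Presumably, cdf( 20 , 10 ) builds the cumulative distribution
# of the model pdf with parameters 20 and 10 , and rf( 100 , c ) draws 100
# random distances from it.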
c = cdf( 20 , 10 )
d = rf( 100 , c )
def LL( x , d ) :
    # IMPORTANT NOTE: LL is defined as minus the log likelihood because its
    # values will be optimised through a minimisation process. Hence, the
    # values that maximise the log likelihood are those that minimise LL.
    # This note will be important in the bootstrap method, as the outliers
    # rejected are those whose rejection increases the log likelihood, and
    # thus decreases LL
    pp = pdf( d , x[ 0 ] , x[ 1 ] )
    return -np.nansum( np.log( pp ) )
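# sign-convention check (an illustrative sketch, assuming the toy dataset d
# above was generated with parameters 20 and 10 ): the generating parameters
# should give a smaller LL, i.e. a larger log likelihood, than distant ones,
#
#     LL( [ 20. , 10. ] , d ) < LL( [ 40. , 1. ] , d )
#
# which is the inequality that optim and bootstrap below exploit.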
# define the Shannon entropy scoring function
def S( mus , sigmas ) :
    N = len( mus )
    M = len( sigmas )
    # convert the lists of [ estimate , standard error ] pairs into
    # numpy arrays of the estimates alone
    mus = np.array( [ mus[ i ][ 0 ] for i in range( N ) ] )
    sigmas = np.array( [ sigmas[ i ][ 0 ] for i in range( M ) ] )
    # compute the inverse of the deltas between consecutive estimates.
    # See point 6 in the section Quantification and Statistical Analysis -
    # Image processing and distance measurements, in Picco et al. Cell 2018.
    # The divide-by-zero and invalid-value warnings are silenced locally
    # with np.errstate rather than by changing the global numpy error state
    with np.errstate( divide = 'ignore' , invalid = 'ignore' ) :
        Im = 1 / np.abs( mus[ 1: ] - mus[ :-1 ] )
        Is = 1 / np.abs( sigmas[ 1: ] - sigmas[ :-1 ] )
        p_m = Im / np.nansum( Im )
        p_s = Is / np.nansum( Is )
        return - p_m * np.log( p_m ) - p_s * np.log( p_s )
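# worked example of the scoring (a self-contained sketch with made-up
# numbers): when consecutive estimates plateau, their delta is small, its
# inverse is large, and the corresponding p_m / p_s weight dominates.
# For instance,
#
#     S( [ [ 10. , 1 ] , [ 9.9 , 1 ] , [ 7. , 1 ] ] ,
#        [ [ 3. , 1 ] , [ 2.9 , 1 ] , [ 2. , 1 ] ] )
#
# returns one entropy contribution per pair of consecutive estimates;
# the bootstrap below uses the extrema of these contributions.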
def optim( LL , x0 , d , verbose = False ) :
    # compute the minimisation, catching the warnings that the method
    # choice might raise. args must be a tuple, so d is wrapped in one;
    # the BFGS method is requested explicitly because it returns the
    # approximated inverse Hessian needed for the standard errors below
    with warnings.catch_warnings() :
        warnings.simplefilter( 'ignore' )
        opt = minimize( LL , x0 , args = ( d , ) , method = 'BFGS' )
    # extract the parameter estimates and compute their standard
    # error estimates from the diagonal of the inverse Hessian
    x = opt[ 'x' ]
    se = np.sqrt( np.diagonal( opt[ 'hess_inv' ] ) )
    # formatting and output
    mu = [ x[ 0 ] , se[ 0 ] ]
    sigma = [ x[ 1 ] , se[ 1 ] ]
    if verbose : # print results
        print( '-----OPTIM-----' )
        print( 'mu : '
                + str( np.round( mu[ 0 ] , 2 ) )
                + ' +/- '
                + str( np.round( mu[ 1 ] , 2 ) ) )
        print( 'sigma : '
                + str( np.round( sigma[ 0 ] , 2 ) )
                + ' +/- '
                + str( np.round( sigma[ 1 ] , 2 ) ) )
    return mu , sigma
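# usage sketch (the starting guess x0 = [ 15. , 5. ] is arbitrary and only
# illustrative):
#
#     mu , sigma = optim( LL , x0 = [ 15. , 5. ] , d = d , verbose = True )
#
# mu and sigma are [ estimate , standard error ] pairs; the standard errors
# derive from the BFGS approximation of the inverse Hessian, and are
# therefore asymptotic estimates.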
def bootstrap( LL , x0 , d , cutoff = np.nan ) :
    # sorting the distances is important: it eases the min LL analysis by
    # focusing only on the LL values past the LL max, thus on those values
    # coming from the tail of the distribution (i.e. the largest distances)
    d.sort()
    # cutoff != cutoff is True only when cutoff is NaN, i.e. when no cutoff
    # was passed. The default stops the search once a third of the data has
    # been removed (5 in the R code, but it is safe to assume that more than
    # half of the data are good)
    if cutoff != cutoff : cutoff = 2 * len( d ) / 3
    # initial estimate on the full, ordered dataset. Important outliers are
    # in the right tail, so it is convenient to start removing those
    m , s = optim( LL , x0 , d )
    # storage for the incremental changes in the estimates of mu (m) and
    # sigma (s), and for the shrinking datasets (dd)
    mu = [ m ]
    sigma = [ s ]
    dd = [ d ]
    # start the outlier search
    search = True
    while search :
        # the number of distance measurements left
        n = len( dd[ -1 ] )
        # storage vector for the LL estimates when removing a distance
        l = []
        # storage vector for the distance dataset with each removed distance
        dtmp = []
        for i in range( n ) :
            dtmp.append( [ dd[ -1 ][ j ] for j in range( n ) if j != i ] )
            l.append( LL( x0 , dtmp[ i ] ) )
        # keep the d that is 'more likely' to belong to the dataset (i.e.
        # max likelihood). NOTE that LL has been defined with a minus sign
        # for optim to search for a minimum. Thus, here we search for a max
        # of LL where we would search for a min of the log likelihood, and
        # vice versa
        l_max = l.index( max( l ) ) # the index of the value most pertinent
        # to the pdf, whose removal maximises LL (i.e. minimises the log
        # likelihood)
        ll = l[ l_max : ]
        i_sel = l_max + ll.index( min( ll ) )
        new_dataset = dtmp[ i_sel ]
        # compute a new optimisation on the dataset without the 'less
        # likely' distance
        m , s = optim( LL , x0 = x0 , d = new_dataset , verbose = False )
        # store the mu and sigma values
        mu.append( m )
        sigma.append( s )
        dd.append( new_dataset )
        if len( new_dataset ) < cutoff :
            search = False
    # compute and store the Shannon entropy
    sh = S( mu , sigma ).tolist()
    print( '-----BOOTSTRAP-----' )
    print( 'max Sh: ' + str( np.nanmax( sh ) ) )
    i_max = sh.index( np.nanmax( sh ) )
    print( 'mu = ' + str( mu[ i_max ][ 0 ] ) + ' +/- ' + str( mu[ i_max ][ 1 ] ) )
    print( 'sigma = ' + str( sigma[ i_max ][ 0 ] ) + ' +/- ' + str( sigma[ i_max ][ 1 ] ) )
    print( 'min Sh: ' + str( np.nanmin( sh ) ) )
    i_min = sh.index( np.nanmin( sh ) )
    print( 'mu = ' + str( mu[ i_min ][ 0 ] ) + ' +/- ' + str( mu[ i_min ][ 1 ] ) )
    print( 'sigma = ' + str( sigma[ i_min ][ 0 ] ) + ' +/- ' + str( sigma[ i_min ][ 1 ] ) )
    return mu , sigma , sh , i_min , i_max
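# demonstration run on the toy dataset (illustrative; the starting guess x0
# is arbitrary and not prescribed elsewhere in the file)
if __name__ == '__main__' :
    x0 = [ 15. , 5. ]
    mu , sigma , sh , i_min , i_max = bootstrap( LL , x0 , d )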