from ..vartypes import discrete_types

import theano
-from ..theanof import make_shared_replacements, join_nonshared_inputs, CallableTensor, gradient
+from ..theanof import make_shared_replacements, join_nonshared_inputs, CallableTensor
from theano.tensor import exp, dvector
import theano.tensor as tt
from theano.sandbox.rng_mrg import MRG_RandomStreams
@@ -29,7 +29,8 @@ def check_discrete_rvs(vars):
        raise ValueError('Model should not include discrete RVs for ADVI.')

def advi(vars=None, start=None, model=None, n=5000, accurate_elbo=False,
-         learning_rate=.001, epsilon=.1, random_seed=20090425, verbose=1):
+         optimizer=None, learning_rate=.001, epsilon=.1, random_seed=20090425,
+         verbose=1):
    """Perform automatic differentiation variational inference (ADVI).

    This function implements the meanfield ADVI, where the variational
@@ -49,10 +50,12 @@ def advi(vars=None, start=None, model=None, n=5000, accurate_elbo=False,
    transformed space, while traces returned by :code:`sample_vp()` are in
    the original space as obtained by MCMC sampling methods in PyMC3.

-    The variational parameters are optimized with a modified version of
-    Adagrad, where only the last (n_window) gradient vectors are used to
-    control the learning rate and older gradient vectors are ignored.
-    n_window denotes the size of time window and fixed to 10.
+    The variational parameters are optimized with the given optimizer, which
+    is a function that returns a dictionary of parameter updates to be passed
+    to a Theano function. If no optimizer is provided, optimization is done
+    with a modified version of Adagrad, where only the last n_window gradient
+    vectors are used to control the learning rate; older gradient vectors are
+    ignored. n_window denotes the size of the time window and is fixed to 10.

    Parameters
    ----------
@@ -66,10 +69,16 @@ def advi(vars=None, start=None, model=None, n=5000, accurate_elbo=False,
        Number of iterations updating parameters.
    accurate_elbo : bool
        If true, 100 MC samples are used for accurate calculation of ELBO.
+    optimizer : (loss, tensor) -> dict or OrderedDict
+        A function that returns parameter updates given loss and a parameter
+        tensor. If :code:`None` (default), a default Adagrad optimizer is
+        used with parameters :code:`learning_rate` and :code:`epsilon` below.
    learning_rate: float
-        Adagrad base learning rate.
+        Base learning rate for Adagrad. This parameter is ignored when an
+        optimizer is given.
    epsilon : float
        Offset in denominator of the scale of learning rate in Adagrad.
+        This parameter is ignored when an optimizer is given.
    random_seed : int
        Seed to initialize random state.
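A minimal sketch of the optimizer contract described above, assuming the function is importable as pymc3.variational.advi; the model, data, and sgd_optimizer here are hypothetical, and any callable mapping (loss, param) to an OrderedDict of updates could be passed in place of the default windowed Adagrad:

from collections import OrderedDict

import numpy as np
import pymc3 as pm
import theano.tensor as tt


def sgd_optimizer(learning_rate=0.01):
    # Hypothetical helper: plain SGD updates satisfying (loss, param) -> updates.
    def optimizer(loss, param):
        grad = tt.grad(loss, param)
        return OrderedDict([(param, param - learning_rate * grad)])
    return optimizer


with pm.Model() as model:
    mu = pm.Normal('mu', mu=0., sd=10.)
    pm.Normal('y', mu=mu, sd=1., observed=np.random.randn(100))

    fit_default = pm.variational.advi(n=2000)                             # default windowed Adagrad
    fit_sgd = pm.variational.advi(n=2000, optimizer=sgd_optimizer(0.01))  # custom optimizer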
@@ -99,9 +108,13 @@ def advi(vars=None, start=None, model=None, n=5000, accurate_elbo=False,
    n_mcsamples = 100 if accurate_elbo else 1

+    # Prepare optimizer
+    if optimizer is None:
+        optimizer = adagrad_optimizer(learning_rate, epsilon)
+
    # Create variational gradient tensor
-    grad, elbo, shared, _ = variational_gradient_estimate(
-        vars, model, n_mcsamples=n_mcsamples, random_seed=random_seed)
+    elbo, shared = _calc_elbo(vars, model, n_mcsamples=n_mcsamples,
+                              random_seed=random_seed)

    # Set starting values
    for var, share in shared.items():
@@ -113,51 +126,16 @@ def advi(vars=None, start=None, model=None, n=5000, accurate_elbo=False,
    w_start = np.zeros_like(u_start)
    uw = np.concatenate([u_start, w_start])

-    result, elbos = run_adagrad(uw, grad, elbo, n, learning_rate=learning_rate, epsilon=epsilon, verbose=verbose)
-
-    l = int(result.size / 2)
-
-    u = bij.rmap(result[:l])
-    w = bij.rmap(result[l:])
-    # w is in log space
-    for var in w.keys():
-        w[var] = np.exp(w[var])
-    return ADVIFit(u, w, elbos)
-
-def replace_shared_minibatch_tensors(minibatch_tensors):
-    """Replace shared variables in minibatch tensors with normal tensors.
-    """
-    givens = dict()
-    tensors = list()
-
-    for t in minibatch_tensors:
-        if isinstance(t, theano.compile.sharedvalue.SharedVariable):
-            t_ = t.type()
-            tensors.append(t_)
-            givens.update({t: t_})
-        else:
-            tensors.append(t)
-
-    return tensors, givens
-
-def run_adagrad(uw, grad, elbo, n, learning_rate=.001, epsilon=.1, verbose=1):
-    """Run Adagrad parameter update.
-
-    This function is only used in batch training.
-    """
-    shared_inarray = theano.shared(uw, 'uw_shared')
-    grad = CallableTensor(grad)(shared_inarray)
-    elbo = CallableTensor(elbo)(shared_inarray)
-
-    updates = adagrad(grad, shared_inarray, learning_rate=learning_rate, epsilon=epsilon, n=10)
+    # Create parameter update function used in the training loop
+    uw_shared = theano.shared(uw, 'uw_shared')
+    elbo = CallableTensor(elbo)(uw_shared)
+    updates = optimizer(loss=-1 * elbo, param=uw_shared)
+    f = theano.function([], [uw_shared, elbo], updates=updates)

-    # Create in-place update function
-    f = theano.function([], [shared_inarray, grad, elbo], updates=updates)
-
-    # Run adagrad steps
+    # Optimization loop
    elbos = np.empty(n)
    for i in range(n):
-        uw_i, g, e = f()
+        uw_i, e = f()
        elbos[i] = e
        if verbose and not i % (n // 10):
            if not i:
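The training loop added above follows a common Theano pattern: the parameters live in a shared variable, the optimizer returns an update dictionary, and a compiled function applies one update per call while returning the current loss. A self-contained sketch of that pattern on a toy quadratic loss with plain SGD updates (all names hypothetical):

from collections import OrderedDict

import numpy as np
import theano
import theano.tensor as tt

param = theano.shared(np.zeros(2), 'param')             # parameters as a shared variable
target = np.array([1., -2.])
loss = tt.sum((param - target) ** 2)                    # toy quadratic loss

grad = tt.grad(loss, param)
updates = OrderedDict([(param, param - 0.1 * grad)])    # one SGD step per call
step = theano.function([], loss, updates=updates)       # compiled update function

for _ in range(100):
    current_loss = step()                               # applies the update in place
# param.get_value() is now close to target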
@@ -169,24 +147,24 @@ def run_adagrad(uw, grad, elbo, n, learning_rate=.001, epsilon=.1, verbose=1):
    if verbose:
        avg_elbo = elbos[-n // 10:].mean()
        print('Finished [100%]: Average ELBO = {}'.format(avg_elbo.round(2)))
-    return uw_i, elbos

-def variational_gradient_estimate(
-        vars, model, minibatch_RVs=[], minibatch_tensors=[], total_size=None,
-        n_mcsamples=1, random_seed=20090425):
-    """Calculate approximate ELBO and its (stochastic) gradient.
-    """
+    # Estimated parameters
+    l = int(uw_i.size / 2)
+    u = bij.rmap(uw_i[:l])
+    w = bij.rmap(uw_i[l:])
+    # w is in log space
+    for var in w.keys():
+        w[var] = np.exp(w[var])
+
+    return ADVIFit(u, w, elbos)

+def _calc_elbo(vars, model, n_mcsamples, random_seed):
+    """Calculate approximate ELBO.
+    """
    theano.config.compute_test_value = 'ignore'
    shared = make_shared_replacements(vars, model)

-    # Correction sample size
-    r = 1 if total_size is None else \
-        float(total_size) / minibatch_tensors[0].shape[0]
-
-    other_RVs = set(model.basic_RVs) - set(minibatch_RVs)
-    factors = [r * var.logpt for var in minibatch_RVs] + \
-        [var.logpt for var in other_RVs] + model.potentials
+    factors = [var.logpt for var in model.basic_RVs] + model.potentials
    logpt = tt.add(*map(tt.sum, factors))

    [logp], inarray = join_nonshared_inputs([logpt], vars, shared)
@@ -195,14 +173,11 @@ def variational_gradient_estimate(
    uw.tag.test_value = np.concatenate([inarray.tag.test_value,
                                        inarray.tag.test_value])

-    elbo = elbo_t(logp, uw, inarray, n_mcsamples=n_mcsamples, random_seed=random_seed)
-
-    # Gradient
-    grad = gradient(elbo, [uw])
+    elbo = _elbo_t(logp, uw, inarray, n_mcsamples, random_seed)

-    return grad, elbo, shared, uw
+    return elbo, shared

-def elbo_t(logp, uw, inarray, n_mcsamples, random_seed):
+def _elbo_t(logp, uw, inarray, n_mcsamples, random_seed):
    """Create Theano tensor of approximate ELBO by Monte Carlo sampling.
    """
    l = (uw.size / 2).astype('int64')
@@ -229,26 +204,50 @@ def elbo_t(logp, uw, inarray, n_mcsamples, random_seed):
    return elbo

-def adagrad(grad, param, learning_rate, epsilon, n):
-    """Create Theano parameter (tensor) updates by Adagrad.
+def adagrad_optimizer(learning_rate, epsilon, n_win=10):
+    """Returns a function that computes windowed Adagrad parameter updates.
+
+    Parameters
+    ----------
+    learning_rate : float
+        Learning rate.
+    epsilon : float
+        Offset to avoid division by zero in the normalizer of Adagrad.
+    n_win : int
+        Number of past steps used to calculate the scale of parameter
+        gradients.
+
+    Returns
+    -------
+    A function (loss, param) -> updates.
+
+    loss : Theano scalar
+        Loss function to be minimized (e.g., negative ELBO).
+    param : Theano tensor
+        Parameters to be optimized.
+    updates : OrderedDict
+        Parameter updates to be used in Theano functions.
    """
-    # Compute windowed adagrad using last n gradients
-    i = theano.shared(np.array(0), 'i')
-    value = param.get_value(borrow=True)
-    accu = theano.shared(np.zeros(value.shape + (n,), dtype=value.dtype))
-
-    # Append squared gradient vector to accu_new
-    accu_new = tt.set_subtensor(accu[:, i], grad ** 2)
-    i_new = tt.switch((i + 1) < n, i + 1, 0)
-
-    updates = OrderedDict()
-    updates[accu] = accu_new
-    updates[i] = i_new
-
-    accu_sum = accu_new.sum(axis=1)
-    updates[param] = param - (-learning_rate * grad /
-                              tt.sqrt(accu_sum + epsilon))
-    return updates
+    def optimizer(loss, param):
+        i = theano.shared(np.array(0))
+        value = param.get_value(borrow=True)
+        accu = theano.shared(np.zeros(value.shape + (n_win,), dtype=value.dtype))
+        grad = tt.grad(loss, param)
+
+        # Append squared gradient vector to accu_new
+        accu_new = tt.set_subtensor(accu[:, i], grad ** 2)
+        i_new = tt.switch((i + 1) < n_win, i + 1, 0)
+
+        updates = OrderedDict()
+        updates[accu] = accu_new
+        updates[i] = i_new
+
+        accu_sum = accu_new.sum(axis=1)
+        updates[param] = param - (learning_rate * grad /
+                                  tt.sqrt(accu_sum + epsilon))
+
+        return updates
+    return optimizer
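A usage sketch of the factory defined above on a toy quadratic loss (variable names hypothetical); the returned closure plugs into the same (loss, param) -> updates pattern, and only the last n_win squared gradients contribute to the per-element scaling of the step:

import numpy as np
import theano
import theano.tensor as tt

opt = adagrad_optimizer(learning_rate=0.05, epsilon=0.1, n_win=10)

param = theano.shared(np.zeros(3), 'param')     # hypothetical parameter vector
loss = tt.sum((param - 1.0) ** 2)               # toy quadratic loss
step = theano.function([], loss, updates=opt(loss=loss, param=param))

for _ in range(500):
    step()
# param.get_value() approaches 1.0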

def sample_vp(
        vparams, draws=1000, model=None, local_RVs=None, random_seed=20090425,