
Commit f34c0b3

ENH Support arbitrary optimizer.
1 parent 641c0f5 commit f34c0b3

2 files changed: 110 additions & 117 deletions

pymc3/variational/advi.py

Lines changed: 88 additions & 89 deletions
@@ -9,7 +9,7 @@
 from ..vartypes import discrete_types

 import theano
-from ..theanof import make_shared_replacements, join_nonshared_inputs, CallableTensor, gradient
+from ..theanof import make_shared_replacements, join_nonshared_inputs, CallableTensor
 from theano.tensor import exp, dvector
 import theano.tensor as tt
 from theano.sandbox.rng_mrg import MRG_RandomStreams
@@ -29,7 +29,8 @@ def check_discrete_rvs(vars):
         raise ValueError('Model should not include discrete RVs for ADVI.')

 def advi(vars=None, start=None, model=None, n=5000, accurate_elbo=False,
-         learning_rate=.001, epsilon=.1, random_seed=20090425, verbose=1):
+         optimizer=None, learning_rate=.001, epsilon=.1, random_seed=20090425,
+         verbose=1):
     """Perform automatic differentiation variational inference (ADVI).

     This function implements the meanfield ADVI, where the variational
@@ -49,10 +50,12 @@ def advi(vars=None, start=None, model=None, n=5000, accurate_elbo=False,
     transformed space, while traces returned by :code:`sample_vp()` are in
     the original space as obtained by MCMC sampling methods in PyMC3.

-    The variational parameters are optimized with a modified version of
-    Adagrad, where only the last (n_window) gradient vectors are used to
-    control the learning rate and older gradient vectors are ignored.
-    n_window denotes the size of time window and fixed to 10.
+    The variational parameters are optimized with given optimizer, which is a
+    function that returns a dictionary of parameter updates as provided to
+    Theano function. If no optimizer is provided, optimization is performed
+    with a modified version of adagrad, where only the last (n_window) gradient
+    vectors are used to control the learning rate and older gradient vectors
+    are ignored. n_window denotes the size of time window and fixed to 10.

     Parameters
     ----------
@@ -66,10 +69,16 @@ def advi(vars=None, start=None, model=None, n=5000, accurate_elbo=False,
         Number of interations updating parameters.
     accurate_elbo : bool
         If true, 100 MC samples are used for accurate calculation of ELBO.
+    optimizer : (loss, tensor) -> dict or OrderedDict
+        A function that returns parameter updates given loss and parameter
+        tensor. If :code:`None` (default), a default Adagrad optimizer is
+        used with parameters :code:`learning_rate` and :code:`epsilon` below.
     learning_rate: float
-        Adagrad base learning rate.
+        Base learning rate for adagrad. This parameter is ignored when
+        optimizer is given.
     epsilon : float
         Offset in denominator of the scale of learning rate in Adagrad.
+        This parameter is ignored when optimizer is given.
     random_seed : int
         Seed to initialize random state.

@@ -99,9 +108,13 @@ def advi(vars=None, start=None, model=None, n=5000, accurate_elbo=False,

     n_mcsamples = 100 if accurate_elbo else 1

+    # Prepare optimizer
+    if optimizer is None:
+        optimizer = adagrad_optimizer(learning_rate, epsilon)
+
     # Create variational gradient tensor
-    grad, elbo, shared, _ = variational_gradient_estimate(
-        vars, model, n_mcsamples=n_mcsamples, random_seed=random_seed)
+    elbo, shared = _calc_elbo(vars, model, n_mcsamples=n_mcsamples,
+                              random_seed=random_seed)

     # Set starting values
     for var, share in shared.items():
@@ -113,51 +126,16 @@ def advi(vars=None, start=None, model=None, n=5000, accurate_elbo=False,
     w_start = np.zeros_like(u_start)
     uw = np.concatenate([u_start, w_start])

-    result, elbos = run_adagrad(uw, grad, elbo, n, learning_rate=learning_rate, epsilon=epsilon, verbose=verbose)
-
-    l = int(result.size / 2)
-
-    u = bij.rmap(result[:l])
-    w = bij.rmap(result[l:])
-    # w is in log space
-    for var in w.keys():
-        w[var] = np.exp(w[var])
-    return ADVIFit(u, w, elbos)
-
-def replace_shared_minibatch_tensors(minibatch_tensors):
-    """Replace shared variables in minibatch tensors with normal tensors.
-    """
-    givens = dict()
-    tensors = list()
-
-    for t in minibatch_tensors:
-        if isinstance(t, theano.compile.sharedvalue.SharedVariable):
-            t_ = t.type()
-            tensors.append(t_)
-            givens.update({t: t_})
-        else:
-            tensors.append(t)
-
-    return tensors, givens
-
-def run_adagrad(uw, grad, elbo, n, learning_rate=.001, epsilon=.1, verbose=1):
-    """Run Adagrad parameter update.
-
-    This function is only used in batch training.
-    """
-    shared_inarray = theano.shared(uw, 'uw_shared')
-    grad = CallableTensor(grad)(shared_inarray)
-    elbo = CallableTensor(elbo)(shared_inarray)
-
-    updates = adagrad(grad, shared_inarray, learning_rate=learning_rate, epsilon=epsilon, n=10)
+    # Create parameter update function used in the training loop
+    uw_shared = theano.shared(uw, 'uw_shared')
+    elbo = CallableTensor(elbo)(uw_shared)
+    updates = optimizer(loss=-1 * elbo, param=uw_shared)
+    f = theano.function([], [uw_shared, elbo], updates=updates)

-    # Create in-place update function
-    f = theano.function([], [shared_inarray, grad, elbo], updates=updates)
-
-    # Run adagrad steps
+    # Optimization loop
     elbos = np.empty(n)
     for i in range(n):
-        uw_i, g, e = f()
+        uw_i, e = f()
         elbos[i] = e
         if verbose and not i % (n//10):
             if not i:
@@ -169,24 +147,24 @@ def run_adagrad(uw, grad, elbo, n, learning_rate=.001, epsilon=.1, verbose=1):
     if verbose:
         avg_elbo = elbos[-n//10:].mean()
         print('Finished [100%]: Average ELBO = {}'.format(avg_elbo.round(2)))
-    return uw_i, elbos

-def variational_gradient_estimate(
-    vars, model, minibatch_RVs=[], minibatch_tensors=[], total_size=None,
-    n_mcsamples=1, random_seed=20090425):
-    """Calculate approximate ELBO and its (stochastic) gradient.
-    """
+    # Estimated parameters
+    l = int(uw_i.size / 2)
+    u = bij.rmap(uw_i[:l])
+    w = bij.rmap(uw_i[l:])
+    # w is in log space
+    for var in w.keys():
+        w[var] = np.exp(w[var])
+
+    return ADVIFit(u, w, elbos)

+def _calc_elbo(vars, model, n_mcsamples, random_seed):
+    """Calculate approximate ELBO.
+    """
     theano.config.compute_test_value = 'ignore'
     shared = make_shared_replacements(vars, model)

-    # Correction sample size
-    r = 1 if total_size is None else \
-        float(total_size) / minibatch_tensors[0].shape[0]
-
-    other_RVs = set(model.basic_RVs) - set(minibatch_RVs)
-    factors = [r * var.logpt for var in minibatch_RVs] + \
-              [var.logpt for var in other_RVs] + model.potentials
+    factors = [var.logpt for var in model.basic_RVs] + model.potentials
     logpt = tt.add(*map(tt.sum, factors))

     [logp], inarray = join_nonshared_inputs([logpt], vars, shared)
@@ -195,14 +173,11 @@ def variational_gradient_estimate(
     uw.tag.test_value = np.concatenate([inarray.tag.test_value,
                                         inarray.tag.test_value])

-    elbo = elbo_t(logp, uw, inarray, n_mcsamples=n_mcsamples, random_seed=random_seed)
-
-    # Gradient
-    grad = gradient(elbo, [uw])
+    elbo = _elbo_t(logp, uw, inarray, n_mcsamples, random_seed)

-    return grad, elbo, shared, uw
+    return elbo, shared

-def elbo_t(logp, uw, inarray, n_mcsamples, random_seed):
+def _elbo_t(logp, uw, inarray, n_mcsamples, random_seed):
     """Create Theano tensor of approximate ELBO by Monte Carlo sampling.
     """
     l = (uw.size/2).astype('int64')
@@ -229,26 +204,50 @@ def elbo_t(logp, uw, inarray, n_mcsamples, random_seed):

     return elbo

-def adagrad(grad, param, learning_rate, epsilon, n):
-    """Create Theano parameter (tensor) updates by Adagrad.
+def adagrad_optimizer(learning_rate, epsilon, n_win=10):
+    """Returns a function that returns parameter updates.
+
+    Parameter
+    ---------
+    learning_rate : float
+        Learning rate.
+    epsilon : float
+        Offset to avoid zero-division in the normalizer of adagrad.
+    n_win : int
+        Number of past steps to calculate scales of parameter gradients.
+
+    Returns
+    -------
+    A function (loss, param) -> updates.
+
+    loss : Theano scalar
+        Loss function to be minimized (e.g., negative ELBO).
+    param : Theano tensor
+        Parameters to be optimized.
+    updates : OrderedDict
+        Parameter updates used in Theano functions.
     """
-    # Compute windowed adagrad using last n gradients
-    i = theano.shared(np.array(0), 'i')
-    value = param.get_value(borrow=True)
-    accu = theano.shared(np.zeros(value.shape+(n,), dtype=value.dtype))
-
-    # Append squared gradient vector to accu_new
-    accu_new = tt.set_subtensor(accu[:,i], grad ** 2)
-    i_new = tt.switch((i + 1) < n, i + 1, 0)
-
-    updates = OrderedDict()
-    updates[accu] = accu_new
-    updates[i] = i_new
-
-    accu_sum = accu_new.sum(axis=1)
-    updates[param] = param - (-learning_rate * grad /
-                              tt.sqrt(accu_sum + epsilon))
-    return updates
+    def optimizer(loss, param):
+        i = theano.shared(np.array(0))
+        value = param.get_value(borrow=True)
+        accu = theano.shared(np.zeros(value.shape+(n_win,), dtype=value.dtype))
+        # grad = tt.grad(loss, wrt=param)
+        grad = tt.grad(loss, param)
+
+        # Append squared gradient vector to accu_new
+        accu_new = tt.set_subtensor(accu[:,i], grad ** 2)
+        i_new = tt.switch((i + 1) < n_win, i + 1, 0)
+
+        updates = OrderedDict()
+        updates[accu] = accu_new
+        updates[i] = i_new
+
+        accu_sum = accu_new.sum(axis=1)
+        updates[param] = param - (learning_rate * grad /
+                                  tt.sqrt(accu_sum + epsilon))
+
+        return updates
+    return optimizer

 def sample_vp(
     vparams, draws=1000, model=None, local_RVs=None, random_seed=20090425,
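
The new adagrad_optimizer above also spells out the contract a user-supplied optimizer has to satisfy: a callable taking (loss, param) and returning an OrderedDict of Theano updates. A minimal sketch of such an optimizer is plain SGD, assuming only Theano; sgd_optimizer is an illustrative name and is not part of this commit:

from collections import OrderedDict

import theano.tensor as tt

def sgd_optimizer(learning_rate):
    """Return a (loss, param) -> updates callable implementing plain SGD."""
    def optimizer(loss, param):
        # loss is a Theano scalar (e.g. -1 * elbo); param is the shared
        # variable holding the flattened variational parameters.
        grad = tt.grad(loss, param)
        updates = OrderedDict()
        updates[param] = param - learning_rate * grad
        return updates
    return optimizer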

pymc3/variational/advi_minibatch.py

Lines changed: 22 additions & 28 deletions
@@ -6,7 +6,7 @@
 import theano.tensor as tt
 from theano.sandbox.rng_mrg import MRG_RandomStreams
 from collections import OrderedDict
-from .advi import check_discrete_rvs, adagrad, ADVIFit
+from .advi import check_discrete_rvs, ADVIFit, adagrad_optimizer

 __all__ = ['advi_minibatch']

@@ -38,22 +38,6 @@ def _check_minibatches(minibatch_tensors, minibatches):
     _value_error(hasattr(minibatches, "__iter__"),
                  'minibatches must be an iterator.')

-def _replace_shared_minibatch_tensors(minibatch_tensors):
-    """Replace shared variables in minibatch tensors with normal tensors.
-    """
-    givens = dict()
-    tensors = list()
-
-    for t in minibatch_tensors:
-        if isinstance(t, theano.compile.sharedvalue.SharedVariable):
-            t_ = t.type()
-            tensors.append(t_)
-            givens.update({t: t_})
-        else:
-            tensors.append(t)
-
-    return tensors, givens
-
 def _get_rvss(
     minibatch_RVs, local_RVs, observed_RVs, minibatch_tensors, total_size):
     """Returns local_RVs and observed_RVs.
@@ -141,8 +125,7 @@ def _make_logpt(global_RVs, local_RVs, observed_RVs, model):

     return logpt

-def _elbo_t_new(logp, uw_g, uw_l, inarray_g, inarray_l,
-                n_mcsamples, random_seed):
+def _elbo_t(logp, uw_g, uw_l, inarray_g, inarray_l, n_mcsamples, random_seed):
     """Return expression of approximate ELBO based on Monte Carlo sampling.
     """
     r = MRG_RandomStreams(seed=random_seed)
@@ -203,8 +186,8 @@ def _elbo_t_new(logp, uw_g, uw_l, inarray_g, inarray_l,
 def advi_minibatch(vars=None, start=None, model=None, n=5000, n_mcsamples=1,
                    minibatch_RVs=None, minibatch_tensors=None, minibatches=None,
                    local_RVs=None, observed_RVs=None, encoder_params=[],
-                   total_size=None, scales=None, learning_rate=.001, epsilon=.1,
-                   random_seed=20090425, verbose=1):
+                   total_size=None, scales=None, optimizer=None, learning_rate=.001,
+                   epsilon=.1, random_seed=20090425, verbose=1, ):
     """Run mini-batch ADVI.

     minibatch_tensors and minibatches should be in the same order.
@@ -231,10 +214,16 @@ def advi_minibatch(vars=None, start=None, model=None, n=5000, n_mcsamples=1,
         random variables in `minibatch_tensors`.
     total_size : int
         Total size of training samples.
+    optimizer : (loss, tensor) -> dict or OrderedDict
+        A function that returns parameter updates given loss and parameter
+        tensor. If :code:`None` (default), a default Adagrad optimizer is
+        used with parameters :code:`learning_rate` and :code:`epsilon` below.
     learning_rate: float
-        Adagrad base learning rate.
+        Base learning rate for adagrad. This parameter is ignored when
+        optimizer is given.
     epsilon : float
-        Offset in denominator of the scale of learning rate in Adagrad.
+        Offset in denominator of the scale of learning rate in Adagrad.
+        This parameter is ignored when optimizer is given.
     random_seed : int
         Seed to initialize random state.

@@ -251,6 +240,10 @@ def advi_minibatch(vars=None, start=None, model=None, n=5000, n_mcsamples=1,
     check_discrete_rvs(vars)
     _check_minibatches(minibatch_tensors, minibatches)

+    # Prepare optimizer
+    if optimizer is None:
+        optimizer = adagrad_optimizer(learning_rate, epsilon)
+
     # For backward compatibility in how input arguments are given
     local_RVs, observed_RVs = _get_rvss(minibatch_RVs, local_RVs, observed_RVs,
                                         minibatch_tensors, total_size)
@@ -277,8 +270,8 @@ def advi_minibatch(vars=None, start=None, model=None, n=5000, n_mcsamples=1,
     replace = replace_g
     if replace_l is not None: replace.update(replace_l)
     logp = theano.clone(logpt, replace, strict=False)
-    elbo = _elbo_t_new(logp, uw_g, uw_l, inarray_g, inarray_l,
-                       n_mcsamples, random_seed)
+    elbo = _elbo_t(logp, uw_g, uw_l, inarray_g, inarray_l,
+                   n_mcsamples, random_seed)
     del logpt

     # Variational parameters for global RVs
@@ -309,11 +302,12 @@ def advi_minibatch(vars=None, start=None, model=None, n=5000, n_mcsamples=1,
     params = [uw_global_shared] + encoder_params
     updates = OrderedDict()
     for param in params:
-        g = tt.grad(elbo, wrt=param)
-        updates.update(adagrad(g, param, learning_rate, epsilon, n=10))
+        # g = tt.grad(elbo, wrt=param)
+        # updates.update(adagrad(g, param, learning_rate, epsilon, n=10))
+        updates.update(optimizer(loss=-1 * elbo, param=param))
     f = theano.function(tensors, elbo, updates=updates)

-    # Training loop
+    # Optimization loop
     elbos = np.empty(n)
     for i in range(n):
         e = f(*next(minibatches))
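
Putting the two files together, the new argument can be exercised with the full-batch advi() as in the sketch below; the model, the data, and the sgd_optimizer helper from the sketch above are illustrative, and leaving optimizer=None keeps the windowed Adagrad default:

import numpy as np
import pymc3 as pm
from pymc3.variational.advi import advi

data = np.random.randn(100)

with pm.Model() as model:
    mu = pm.Normal('mu', mu=0, sd=10)
    sd = pm.HalfNormal('sd', sd=10)
    pm.Normal('obs', mu=mu, sd=sd, observed=data)

    # Any (loss, param) -> updates callable works here; None falls back to
    # adagrad_optimizer(learning_rate, epsilon) as before this change.
    fit = advi(model=model, n=5000, optimizer=sgd_optimizer(learning_rate=1e-3))

As in the diff above, the returned ADVIFit carries the fitted means, the standard deviations exponentiated back from log space, and the per-iteration ELBO values.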
