
Commit 636bddb

Merge pull request #1294 from taku-y/advi_opt

ENH Arbitrary optimizer for ADVI

2 parents 641c0f5 + bc38179

3 files changed: +145, -124 lines

pymc3/tests/test_advi.py

Lines changed: 35 additions & 7 deletions

@@ -3,7 +3,8 @@
 from pymc3 import Model, Normal, DiscreteUniform, Poisson, switch, Exponential
 from pymc3.theanof import inputvars
 from pymc3.variational import advi, advi_minibatch, sample_vp
-from pymc3.variational.advi import variational_gradient_estimate
+from pymc3.variational.advi import _calc_elbo, adagrad_optimizer
+from pymc3.theanof import CallableTensor
 from theano import function, shared
 import theano.tensor as tt

@@ -17,20 +18,21 @@ def test_elbo():
     # Create a model for test
     with Model() as model:
         mu = Normal('mu', mu=mu0, sd=sigma)
-        y = Normal('y', mu=mu, sd=1, observed=y_obs)
+        Normal('y', mu=mu, sd=1, observed=y_obs)

     vars = inputvars(model.vars)

     # Create variational gradient tensor
-    grad, elbo, shared, uw = variational_gradient_estimate(
-        vars, model, n_mcsamples=10000, random_seed=1)
+    elbo, _ = _calc_elbo(vars, model, n_mcsamples=10000, random_seed=1)

     # Variational posterior parameters
     uw_ = np.array([1.88, np.log(1)])

     # Calculate elbo computed with MonteCarlo
-    f = function([uw], elbo)
-    elbo_mc = f(uw_)
+    uw_shared = shared(uw_, 'uw_shared')
+    elbo = CallableTensor(elbo)(uw_shared)
+    f = function([], elbo)
+    elbo_mc = f()

     # Exact value
     elbo_true = (-0.5 * (

@@ -107,7 +109,7 @@ def test_advi():

     with Model() as model:
         mu_ = Normal('mu', mu=mu0, sd=sd0, testval=0)
-        x = Normal('x', mu=mu_, sd=sd, observed=data)
+        Normal('x', mu=mu_, sd=sd, observed=data)

     advi_fit = advi(
         model=model, n=1000, accurate_elbo=False, learning_rate=1e-1,

@@ -120,6 +122,32 @@ def test_advi():
     np.testing.assert_allclose(np.mean(trace['mu']), mu_post, rtol=0.4)
     np.testing.assert_allclose(np.std(trace['mu']), np.sqrt(1. / d), rtol=0.4)

+def test_advi_optimizer():
+    n = 1000
+    sd0 = 2.
+    mu0 = 4.
+    sd = 3.
+    mu = -5.
+
+    data = sd * np.random.RandomState(0).randn(n) + mu
+
+    d = n / sd**2 + 1 / sd0**2
+    mu_post = (n * np.mean(data) / sd**2 + mu0 / sd0**2) / d
+
+    with Model() as model:
+        mu_ = Normal('mu', mu=mu0, sd=sd0, testval=0)
+        Normal('x', mu=mu_, sd=sd, observed=data)
+
+    optimizer = adagrad_optimizer(learning_rate=0.1, epsilon=0.1)
+    advi_fit = advi(model=model, n=1000, optimizer=optimizer, random_seed=1)
+
+    np.testing.assert_allclose(advi_fit.means['mu'], mu_post, rtol=0.1)
+
+    trace = sample_vp(advi_fit, 10000, model)
+
+    np.testing.assert_allclose(np.mean(trace['mu']), mu_post, rtol=0.4)
+    np.testing.assert_allclose(np.std(trace['mu']), np.sqrt(1. / d), rtol=0.4)
+
 def test_advi_minibatch():
     n = 1000
     sd0 = 2.
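
The updated test_elbo above evaluates the ELBO tensor through a Theano shared variable instead of a symbolic input. As a side note, here is a minimal sketch of that CallableTensor pattern on a toy expression; the expression and values are illustrative and not from the PR, only CallableTensor, theano.shared, and theano.function are taken from the codebase:

import numpy as np
import theano
import theano.tensor as tt
from pymc3.theanof import CallableTensor

uw = tt.dvector('uw')            # free input, e.g. concatenated (mean, log-std)
expr = -0.5 * tt.sum(uw ** 2)    # toy stand-in for an ELBO tensor built from uw

uw_shared = theano.shared(np.array([1.88, 0.0]), 'uw_shared')
expr_at = CallableTensor(expr)(uw_shared)   # clone expr with uw replaced by uw_shared

f = theano.function([], expr_at)            # no explicit inputs are needed
print(f())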

pymc3/variational/advi.py

Lines changed: 88 additions & 89 deletions

@@ -9,7 +9,7 @@
 from ..vartypes import discrete_types

 import theano
-from ..theanof import make_shared_replacements, join_nonshared_inputs, CallableTensor, gradient
+from ..theanof import make_shared_replacements, join_nonshared_inputs, CallableTensor
 from theano.tensor import exp, dvector
 import theano.tensor as tt
 from theano.sandbox.rng_mrg import MRG_RandomStreams

@@ -29,7 +29,8 @@ def check_discrete_rvs(vars):
         raise ValueError('Model should not include discrete RVs for ADVI.')

 def advi(vars=None, start=None, model=None, n=5000, accurate_elbo=False,
-         learning_rate=.001, epsilon=.1, random_seed=20090425, verbose=1):
+         optimizer=None, learning_rate=.001, epsilon=.1, random_seed=20090425,
+         verbose=1):
     """Perform automatic differentiation variational inference (ADVI).

     This function implements the meanfield ADVI, where the variational

@@ -49,10 +50,12 @@ def advi(vars=None, start=None, model=None, n=5000, accurate_elbo=False,
     transformed space, while traces returned by :code:`sample_vp()` are in
     the original space as obtained by MCMC sampling methods in PyMC3.

-    The variational parameters are optimized with a modified version of
-    Adagrad, where only the last (n_window) gradient vectors are used to
-    control the learning rate and older gradient vectors are ignored.
-    n_window denotes the size of time window and fixed to 10.
+    The variational parameters are optimized with given optimizer, which is a
+    function that returns a dictionary of parameter updates as provided to
+    Theano function. If no optimizer is provided, optimization is performed
+    with a modified version of adagrad, where only the last (n_window) gradient
+    vectors are used to control the learning rate and older gradient vectors
+    are ignored. n_window denotes the size of time window and fixed to 10.

     Parameters
     ----------

@@ -66,10 +69,16 @@ def advi(vars=None, start=None, model=None, n=5000, accurate_elbo=False,
         Number of interations updating parameters.
     accurate_elbo : bool
         If true, 100 MC samples are used for accurate calculation of ELBO.
+    optimizer : (loss, tensor) -> dict or OrderedDict
+        A function that returns parameter updates given loss and parameter
+        tensor. If :code:`None` (default), a default Adagrad optimizer is
+        used with parameters :code:`learning_rate` and :code:`epsilon` below.
     learning_rate: float
-        Adagrad base learning rate.
+        Base learning rate for adagrad. This parameter is ignored when
+        optimizer is given.
     epsilon : float
         Offset in denominator of the scale of learning rate in Adagrad.
+        This parameter is ignored when optimizer is given.
     random_seed : int
         Seed to initialize random state.

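
The docstring above defines the optimizer contract as a callable (loss, param) -> updates dict. Below is a minimal sketch of a user-supplied optimizer that satisfies this contract, using plain gradient descent; the name sgd_optimizer and the step size are illustrative and not part of the PR:

from collections import OrderedDict
import theano.tensor as tt

def sgd_optimizer(learning_rate=0.01):
    def optimizer(loss, param):
        grad = tt.grad(loss, param)                    # gradient of the loss w.r.t. param
        updates = OrderedDict()
        updates[param] = param - learning_rate * grad  # one plain SGD step
        return updates
    return optimizer

# Passed to ADVI like the default Adagrad factory, e.g.:
# advi_fit = advi(model=model, n=1000, optimizer=sgd_optimizer(0.01))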
@@ -99,9 +108,13 @@ def advi(vars=None, start=None, model=None, n=5000, accurate_elbo=False,

     n_mcsamples = 100 if accurate_elbo else 1

+    # Prepare optimizer
+    if optimizer is None:
+        optimizer = adagrad_optimizer(learning_rate, epsilon)
+
     # Create variational gradient tensor
-    grad, elbo, shared, _ = variational_gradient_estimate(
-        vars, model, n_mcsamples=n_mcsamples, random_seed=random_seed)
+    elbo, shared = _calc_elbo(vars, model, n_mcsamples=n_mcsamples,
+                              random_seed=random_seed)

     # Set starting values
     for var, share in shared.items():

@@ -113,51 +126,16 @@ def advi(vars=None, start=None, model=None, n=5000, accurate_elbo=False,
     w_start = np.zeros_like(u_start)
     uw = np.concatenate([u_start, w_start])

-    result, elbos = run_adagrad(uw, grad, elbo, n, learning_rate=learning_rate, epsilon=epsilon, verbose=verbose)
-
-    l = int(result.size / 2)
-
-    u = bij.rmap(result[:l])
-    w = bij.rmap(result[l:])
-    # w is in log space
-    for var in w.keys():
-        w[var] = np.exp(w[var])
-    return ADVIFit(u, w, elbos)
-
-def replace_shared_minibatch_tensors(minibatch_tensors):
-    """Replace shared variables in minibatch tensors with normal tensors.
-    """
-    givens = dict()
-    tensors = list()
-
-    for t in minibatch_tensors:
-        if isinstance(t, theano.compile.sharedvalue.SharedVariable):
-            t_ = t.type()
-            tensors.append(t_)
-            givens.update({t: t_})
-        else:
-            tensors.append(t)
-
-    return tensors, givens
-
-def run_adagrad(uw, grad, elbo, n, learning_rate=.001, epsilon=.1, verbose=1):
-    """Run Adagrad parameter update.
-
-    This function is only used in batch training.
-    """
-    shared_inarray = theano.shared(uw, 'uw_shared')
-    grad = CallableTensor(grad)(shared_inarray)
-    elbo = CallableTensor(elbo)(shared_inarray)
-
-    updates = adagrad(grad, shared_inarray, learning_rate=learning_rate, epsilon=epsilon, n=10)
+    # Create parameter update function used in the training loop
+    uw_shared = theano.shared(uw, 'uw_shared')
+    elbo = CallableTensor(elbo)(uw_shared)
+    updates = optimizer(loss=-1 * elbo, param=uw_shared)
+    f = theano.function([], [uw_shared, elbo], updates=updates)

-    # Create in-place update function
-    f = theano.function([], [shared_inarray, grad, elbo], updates=updates)
-
-    # Run adagrad steps
+    # Optimization loop
     elbos = np.empty(n)
     for i in range(n):
-        uw_i, g, e = f()
+        uw_i, e = f()
         elbos[i] = e
         if verbose and not i % (n//10):
             if not i:
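
The new training loop above compiles a single Theano function whose updates come from the optimizer, and each call moves the shared parameter vector in place. Here is a self-contained sketch of that pattern on a toy quadratic loss; all names and values are illustrative, not from the PR:

from collections import OrderedDict
import numpy as np
import theano
import theano.tensor as tt

param = theano.shared(np.array([3.0, -2.0]), 'param')   # parameters live in a shared variable
loss = tt.sum(param ** 2)                                # toy loss to minimize

grad = tt.grad(loss, param)
updates = OrderedDict([(param, param - 0.1 * grad)])     # what an optimizer(loss, param) would return

step = theano.function([], [param, loss], updates=updates)
for i in range(100):
    value, loss_i = step()                               # each call applies the updates in place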
@@ -169,24 +147,24 @@ def run_adagrad(uw, grad, elbo, n, learning_rate=.001, epsilon=.1, verbose=1):
     if verbose:
         avg_elbo = elbos[-n//10:].mean()
         print('Finished [100%]: Average ELBO = {}'.format(avg_elbo.round(2)))
-    return uw_i, elbos

-def variational_gradient_estimate(
-    vars, model, minibatch_RVs=[], minibatch_tensors=[], total_size=None,
-    n_mcsamples=1, random_seed=20090425):
-    """Calculate approximate ELBO and its (stochastic) gradient.
-    """
+    # Estimated parameters
+    l = int(uw_i.size / 2)
+    u = bij.rmap(uw_i[:l])
+    w = bij.rmap(uw_i[l:])
+    # w is in log space
+    for var in w.keys():
+        w[var] = np.exp(w[var])
+
+    return ADVIFit(u, w, elbos)

+def _calc_elbo(vars, model, n_mcsamples, random_seed):
+    """Calculate approximate ELBO.
+    """
     theano.config.compute_test_value = 'ignore'
     shared = make_shared_replacements(vars, model)

-    # Correction sample size
-    r = 1 if total_size is None else \
-        float(total_size) / minibatch_tensors[0].shape[0]
-
-    other_RVs = set(model.basic_RVs) - set(minibatch_RVs)
-    factors = [r * var.logpt for var in minibatch_RVs] + \
-        [var.logpt for var in other_RVs] + model.potentials
+    factors = [var.logpt for var in model.basic_RVs] + model.potentials
     logpt = tt.add(*map(tt.sum, factors))

     [logp], inarray = join_nonshared_inputs([logpt], vars, shared)

@@ -195,14 +173,11 @@ def variational_gradient_estimate(
     uw.tag.test_value = np.concatenate([inarray.tag.test_value,
                                         inarray.tag.test_value])

-    elbo = elbo_t(logp, uw, inarray, n_mcsamples=n_mcsamples, random_seed=random_seed)
-
-    # Gradient
-    grad = gradient(elbo, [uw])
+    elbo = _elbo_t(logp, uw, inarray, n_mcsamples, random_seed)

-    return grad, elbo, shared, uw
+    return elbo, shared

-def elbo_t(logp, uw, inarray, n_mcsamples, random_seed):
+def _elbo_t(logp, uw, inarray, n_mcsamples, random_seed):
     """Create Theano tensor of approximate ELBO by Monte Carlo sampling.
     """
     l = (uw.size/2).astype('int64')

@@ -229,26 +204,50 @@ def elbo_t(logp, uw, inarray, n_mcsamples, random_seed):

     return elbo

-def adagrad(grad, param, learning_rate, epsilon, n):
-    """Create Theano parameter (tensor) updates by Adagrad.
+def adagrad_optimizer(learning_rate, epsilon, n_win=10):
+    """Returns a function that returns parameter updates.
+
+    Parameter
+    ---------
+    learning_rate : float
+        Learning rate.
+    epsilon : float
+        Offset to avoid zero-division in the normalizer of adagrad.
+    n_win : int
+        Number of past steps to calculate scales of parameter gradients.
+
+    Returns
+    -------
+    A function (loss, param) -> updates.
+
+    loss : Theano scalar
+        Loss function to be minimized (e.g., negative ELBO).
+    param : Theano tensor
+        Parameters to be optimized.
+    updates : OrderedDict
+        Parameter updates used in Theano functions.
     """
-    # Compute windowed adagrad using last n gradients
-    i = theano.shared(np.array(0), 'i')
-    value = param.get_value(borrow=True)
-    accu = theano.shared(np.zeros(value.shape+(n,), dtype=value.dtype))
-
-    # Append squared gradient vector to accu_new
-    accu_new = tt.set_subtensor(accu[:,i], grad ** 2)
-    i_new = tt.switch((i + 1) < n, i + 1, 0)
-
-    updates = OrderedDict()
-    updates[accu] = accu_new
-    updates[i] = i_new
-
-    accu_sum = accu_new.sum(axis=1)
-    updates[param] = param - (-learning_rate * grad /
-                              tt.sqrt(accu_sum + epsilon))
-    return updates
+    def optimizer(loss, param):
+        i = theano.shared(np.array(0))
+        value = param.get_value(borrow=True)
+        accu = theano.shared(np.zeros(value.shape+(n_win,), dtype=value.dtype))
+        # grad = tt.grad(loss, wrt=param)
+        grad = tt.grad(loss, param)
+
+        # Append squared gradient vector to accu_new
+        accu_new = tt.set_subtensor(accu[:,i], grad ** 2)
+        i_new = tt.switch((i + 1) < n_win, i + 1, 0)
+
+        updates = OrderedDict()
+        updates[accu] = accu_new
+        updates[i] = i_new
+
+        accu_sum = accu_new.sum(axis=1)
+        updates[param] = param - (learning_rate * grad /
+                                  tt.sqrt(accu_sum + epsilon))
+
+        return updates
+    return optimizer

 def sample_vp(
     vparams, draws=1000, model=None, local_RVs=None, random_seed=20090425,
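
For completeness, a short sketch of using the new adagrad_optimizer factory on its own, outside of advi(). The toy parameter and loss below are illustrative; the import path and function signature follow this diff:

import numpy as np
import theano
import theano.tensor as tt
from pymc3.variational.advi import adagrad_optimizer

param = theano.shared(np.zeros(2), 'param')
loss = tt.sum((param - np.array([1.0, -1.0])) ** 2)

# Windowed Adagrad: step sizes are normalized by the accumulated squares
# of the last n_win gradients (plus epsilon).
optimizer = adagrad_optimizer(learning_rate=0.1, epsilon=0.1, n_win=10)
step = theano.function([], loss, updates=optimizer(loss=loss, param=param))

for _ in range(200):
    step()
print(param.get_value())   # expected to approach [1., -1.]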
