
Commit 10c9330

add beta-vae (#3071)
* add beta-VAE, get strange test fail
* add test backward compat
* docs
* improve docs, increment release notes
1 parent d1d2aa2 commit 10c9330

File tree: 6 files changed, +209 −12 lines

RELEASE-NOTES.md

Lines changed: 2 additions & 0 deletions
@@ -25,13 +25,15 @@
   final trace.
 - Add `model_to_graphviz` (which uses the optional dependency `graphviz`) to
   plot a directed graph of a PyMC3 model using plate notation.
+- Add beta-ELBO variational inference as in the beta-VAE model (Christopher P. Burgess et al., NIPS 2017)

 ### Fixes

 - Fixed `KeyError` raised when only a subset of variables are specified to be recorded in the trace.
 - Removed unused `repeat=None` arguments from all `random()` methods in distributions.
 - Deprecated the `sigma` argument in `MarginalSparse.marginal_likelihood` in favor of `noise`
 - Fixed unexpected behavior in `random`. Now the `random` functionality is more robust and will work better for `sample_prior` when that is implemented.
+- Fixed `scale_cost_to_minibatch` behaviour; previously it was not working and always `False`

 ## PyMC 3.4.1 (April 18 2018)

pymc3/model.py

Lines changed: 8 additions & 1 deletion
@@ -737,7 +737,14 @@ def varlogpt(self):
         """Theano scalar of log-probability of the unobserved random variables
         (excluding deterministic)."""
         with self:
-            factors = [var.logpt for var in self.vars]
+            factors = [var.logpt for var in self.free_RVs]
+            return tt.sum(factors)
+
+    @property
+    def datalogpt(self):
+        with self:
+            factors = [var.logpt for var in self.observed_RVs]
+            factors += [tt.sum(factor) for factor in self.potentials]
             return tt.sum(factors)

     @property
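The split is exhaustive: `free_RVs` go to `varlogpt`, and `observed_RVs` plus potentials go to the new `datalogpt`, so the two terms add back up to `model.logpt`. A quick sanity check of that identity — a minimal sketch assuming the Theano-era `Model.fn` point-function helper:

    import numpy as np
    import pymc3 as pm

    with pm.Model() as model:
        mu = pm.Normal('mu', mu=0., sd=1.)
        pm.Normal('y', mu=mu, sd=1., observed=np.array([0.1, -0.2]))

    point = model.test_point  # {'mu': array(0.)}
    # prior term (free RVs) plus data term (observed RVs and potentials)
    # should reproduce the full model log-probability
    prior_term = model.fn(model.varlogpt)(point)
    data_term = model.fn(model.datalogpt)(point)
    np.testing.assert_allclose(model.fn(model.logpt)(point),
                               prior_term + data_term)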

pymc3/tests/test_variational_inference.py

Lines changed: 89 additions & 0 deletions
@@ -459,6 +459,93 @@ def test_elbo():
     np.testing.assert_allclose(elbo_mc, elbo_true, rtol=0, atol=1e-1)


+@pytest.mark.parametrize(
+    'aux_total_size',
+    range(2, 10, 3)
+)
+def test_scale_cost_to_minibatch_works(aux_total_size):
+    mu0 = 1.5
+    sigma = 1.0
+    y_obs = np.array([1.6, 1.4])
+    beta = len(y_obs) / float(aux_total_size)
+    post_mu = np.array([1.88], dtype=theano.config.floatX)
+    post_sd = np.array([1], dtype=theano.config.floatX)
+
+    # TODO: theano_config
+    # with pm.Model(theano_config=dict(floatX='float64')):
+    # did not work as expected; there were some numeric problems,
+    # so float64 is forced
+    with pm.theanof.change_flags(floatX='float64', warn_float64='ignore'):
+        with pm.Model():
+            assert theano.config.floatX == 'float64'
+            assert theano.config.warn_float64 == 'ignore'
+            mu = pm.Normal('mu', mu=mu0, sd=sigma)
+            pm.Normal('y', mu=mu, sd=1, observed=y_obs, total_size=aux_total_size)
+            # Create variational gradient tensor
+            mean_field_1 = MeanField()
+            assert mean_field_1.scale_cost_to_minibatch
+            mean_field_1.shared_params['mu'].set_value(post_mu)
+            mean_field_1.shared_params['rho'].set_value(np.log(np.exp(post_sd) - 1))
+
+            with pm.theanof.change_flags(compute_test_value='off'):
+                elbo_via_total_size_scaled = -pm.operators.KL(mean_field_1)()(10000)
+
+        with pm.Model():
+            mu = pm.Normal('mu', mu=mu0, sd=sigma)
+            pm.Normal('y', mu=mu, sd=1, observed=y_obs, total_size=aux_total_size)
+            # Create variational gradient tensor
+            mean_field_2 = MeanField()
+            assert mean_field_2.scale_cost_to_minibatch
+            mean_field_2.scale_cost_to_minibatch = False
+            assert not mean_field_2.scale_cost_to_minibatch
+            mean_field_2.shared_params['mu'].set_value(post_mu)
+            mean_field_2.shared_params['rho'].set_value(np.log(np.exp(post_sd) - 1))
+
+            with pm.theanof.change_flags(compute_test_value='off'):
+                elbo_via_total_size_unscaled = -pm.operators.KL(mean_field_2)()(10000)
+
+        np.testing.assert_allclose(elbo_via_total_size_unscaled.eval(),
+                                   elbo_via_total_size_scaled.eval() * pm.floatX(1 / beta),
+                                   rtol=0.02, atol=1e-1)
+
+
+@pytest.mark.parametrize(
+    'aux_total_size',
+    range(2, 10, 3)
+)
+def test_elbo_beta_kl(aux_total_size):
+    mu0 = 1.5
+    sigma = 1.0
+    y_obs = np.array([1.6, 1.4])
+    beta = len(y_obs) / float(aux_total_size)
+    post_mu = np.array([1.88], dtype=theano.config.floatX)
+    post_sd = np.array([1], dtype=theano.config.floatX)
+    with pm.theanof.change_flags(floatX='float64', warn_float64='ignore'):
+        with pm.Model():
+            mu = pm.Normal('mu', mu=mu0, sd=sigma)
+            pm.Normal('y', mu=mu, sd=1, observed=y_obs, total_size=aux_total_size)
+            # Create variational gradient tensor
+            mean_field_1 = MeanField()
+            mean_field_1.scale_cost_to_minibatch = True
+            mean_field_1.shared_params['mu'].set_value(post_mu)
+            mean_field_1.shared_params['rho'].set_value(np.log(np.exp(post_sd) - 1))
+
+            with pm.theanof.change_flags(compute_test_value='off'):
+                elbo_via_total_size_scaled = -pm.operators.KL(mean_field_1)()(10000)
+
+        with pm.Model():
+            mu = pm.Normal('mu', mu=mu0, sd=sigma)
+            pm.Normal('y', mu=mu, sd=1, observed=y_obs)
+            # Create variational gradient tensor
+            mean_field_3 = MeanField()
+            mean_field_3.shared_params['mu'].set_value(post_mu)
+            mean_field_3.shared_params['rho'].set_value(np.log(np.exp(post_sd) - 1))
+
+            with pm.theanof.change_flags(compute_test_value='off'):
+                elbo_via_beta_kl = -pm.operators.KL(mean_field_3, beta=beta)()(10000)
+
+        np.testing.assert_allclose(elbo_via_total_size_scaled.eval(),
+                                   elbo_via_beta_kl.eval(), rtol=0, atol=1e-1)
+
+
 @pytest.fixture(
     'module',
     params=[True, False],
@@ -581,6 +668,8 @@ def fit_kwargs(inference, use_minibatch):
     }
     if use_minibatch:
         key = 'mini'
+        # backward compat for PR#3071
+        inference.approx.scale_cost_to_minibatch = False
     else:
         key = 'full'
     return _select[(type(inference), key)]
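The two tests pin down the same identity from opposite ends: declaring `total_size=N` on an n-point observed minibatch and scaling the cost to the minibatch should match an unscaled model whose KL term is multiplied by β = n/N. With n = 2 observations and the parametrized sizes N ∈ {2, 5, 8}, the tests exercise

    \beta = \frac{n}{N} = \frac{2}{N} \in \{\, 1,\ 0.4,\ 0.25 \,\}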

pymc3/variational/inference.py

Lines changed: 17 additions & 3 deletions
@@ -22,6 +22,7 @@
     'FullRankADVI',
     'SVGD',
     'ASVGD',
+    'NFVI',
     'Inference',
     'ImplicitGradient',
     'KLqp',
@@ -272,15 +273,28 @@ class KLqp(Inference):
     """**Kullback Leibler Divergence Inference**

     General approach to fit Approximations that define :math:`logq`
-    by maximizing ELBO (Evidence Lower Bound).
+    by maximizing ELBO (Evidence Lower Bound). In some cases
+    rescaling the regularization term KL may be beneficial
+
+    .. math::
+
+        ELBO_\beta = \log p(D|\theta) - \beta KL(q||p)

     Parameters
     ----------
     approx : :class:`Approximation`
         Approximation to fit, it is required to have `logQ`
+    beta : float
+        Scales the regularization term in ELBO (see Christopher P. Burgess et al., 2017)
+
+    References
+    ----------
+    -   Christopher P. Burgess et al. (NIPS, 2017)
+        Understanding disentangling in :math:`\beta`-VAE
+        arXiv preprint 1804.03599
     """
-    def __init__(self, approx):
-        super(KLqp, self).__init__(KL, approx, None)
+    def __init__(self, approx, beta=1.):
+        super(KLqp, self).__init__(KL, approx, None, beta=beta)


 class ADVI(KLqp):
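With that constructor change, β surfaces in the user-facing API. A minimal usage sketch, assuming the top-level `pm.MeanField`/`pm.KLqp` exports and an illustrative `beta=0.5`:

    import numpy as np
    import pymc3 as pm

    y_obs = np.array([1.6, 1.4])

    with pm.Model():
        mu = pm.Normal('mu', mu=1.5, sd=1.0)
        pm.Normal('y', mu=mu, sd=1.0, observed=y_obs)
        approx = pm.MeanField()

    # beta=1. recovers plain KLqp/ADVI; beta < 1 down-weights the KL
    # regularizer relative to the data term, as in beta-VAE
    inference = pm.KLqp(approx, beta=0.5)
    approx = inference.fit(10000)
    trace = approx.sample(500)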

pymc3/variational/operators.py

Lines changed: 23 additions & 1 deletion
@@ -14,13 +14,33 @@
 class KL(Operator):
     R"""**Operator based on Kullback Leibler Divergence**

+    This operator constructs the Evidence Lower Bound (ELBO) objective
+
+    .. math::
+
+        ELBO_\beta = \log p(D|\theta) - \beta KL(q||p)
+
+    where
+
     .. math::

         KL[q(v)||p(v)] = \int q(v)\log\frac{q(v)}{p(v)}dv
+
+
+    Parameters
+    ----------
+    approx : :class:`Approximation`
+        Approximation used for inference
+    beta : float
+        Beta parameter for KL divergence, scales the regularization term.
     """

+    def __init__(self, approx, beta=1.):
+        Operator.__init__(self, approx)
+        self.beta = pm.floatX(beta)
+
     def apply(self, f):
-        return self.logq_norm - self.logp_norm
+        return -self.datalogp_norm + self.beta * (self.logq_norm - self.varlogp_norm)

 # SVGD Implementation

@@ -76,6 +96,8 @@ class KSD(Operator):
     ----------
     approx : :class:`Approximation`
         Approximation used for inference
+    temperature: float
+        Temperature for Stein gradient

     References
     ----------
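For orientation, `apply` is exactly the negative β-ELBO from the docstring: since `logp_norm = varlogp_norm + datalogp_norm` (see the `opvi.py` changes below), the returned quantity is

    -\mathrm{ELBO}_\beta
      = -\,\mathbb{E}_q\!\left[\log p(D \mid \theta)\right]
      + \beta \left( \mathbb{E}_q\!\left[\log q(\theta)\right]
                   - \mathbb{E}_q\!\left[\log p(\theta)\right] \right)

and setting β = 1 collapses it to the previous `self.logq_norm - self.logp_norm`.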

pymc3/variational/opvi.py

Lines changed: 70 additions & 7 deletions
@@ -389,8 +389,12 @@ def __init__(self, approx):

     inputs = property(lambda self: self.approx.inputs)
     logp = property(lambda self: self.approx.logp)
+    varlogp = property(lambda self: self.approx.varlogp)
+    datalogp = property(lambda self: self.approx.datalogp)
     logq = property(lambda self: self.approx.logq)
     logp_norm = property(lambda self: self.approx.logp_norm)
+    varlogp_norm = property(lambda self: self.approx.varlogp_norm)
+    datalogp_norm = property(lambda self: self.approx.datalogp_norm)
     logq_norm = property(lambda self: self.approx.logq_norm)
     model = property(lambda self: self.approx.model)

@@ -1298,7 +1302,10 @@ def symbolic_normalizing_constant(self):
         """*Dev* - normalizing constant for `self.logq`, scales it to `minibatch_size` instead of `total_size`.
         Here the effect is controlled by `self.scale_cost_to_minibatch`
         """
-        t = tt.max(self.collect('symbolic_normalizing_constant'))
+        t = tt.max(
+            self.collect('symbolic_normalizing_constant') + [
+                var.scaling for var in self.model.observed_RVs
+            ])
         t = tt.switch(self._scale_cost_to_minibatch, t,
                       tt.constant(1, dtype=t.dtype))
         return pm.floatX(t)
@@ -1318,28 +1325,83 @@ def logq_norm(self):
         """*Dev* - collects `logQ` for all groups and normalizes it"""
         return self.logq / self.symbolic_normalizing_constant

+    @node_property
+    def _sized_symbolic_varlogp_and_datalogp(self):
+        """*Dev* - computes sampled prior and data terms from model via `theano.scan`"""
+        varlogp_s, datalogp_s = self.symbolic_sample_over_posterior(
+            [self.model.varlogpt, self.model.datalogpt])
+        return varlogp_s, datalogp_s  # both shape (s,)
+
+    @node_property
+    def sized_symbolic_varlogp(self):
+        """*Dev* - computes sampled prior term from model via `theano.scan`"""
+        return self._sized_symbolic_varlogp_and_datalogp[0]  # shape (s,)
+
+    @node_property
+    def sized_symbolic_datalogp(self):
+        """*Dev* - computes sampled data term from model via `theano.scan`"""
+        return self._sized_symbolic_varlogp_and_datalogp[1]  # shape (s,)
+
     @node_property
     def sized_symbolic_logp(self):
-        """*Dev* - computes sampled `logP` from model via `theano.scan`"""
-        free_logp_local = self.symbolic_sample_over_posterior(self.model.logpt)
-        return free_logp_local  # shape (s,)
+        """*Dev* - computes sampled logP from model via `theano.scan`"""
+        return self.sized_symbolic_varlogp + self.sized_symbolic_datalogp  # shape (s,)

     @node_property
     def logp(self):
         """*Dev* - computes :math:`E_{q}(logP)` from model via `theano.scan` that can be optimized later"""
-        return self.sized_symbolic_logp.mean(0)
+        return self.varlogp + self.datalogp
+
+    @node_property
+    def varlogp(self):
+        """*Dev* - computes :math:`E_{q}(prior term)` from model via `theano.scan` that can be optimized later"""
+        return self.sized_symbolic_varlogp.mean(0)
+
+    @node_property
+    def datalogp(self):
+        """*Dev* - computes :math:`E_{q}(data term)` from model via `theano.scan` that can be optimized later"""
+        return self.sized_symbolic_datalogp.mean(0)
+
+    @node_property
+    def _single_symbolic_varlogp_and_datalogp(self):
+        """*Dev* - computes single-sample prior and data terms from model without `theano.scan`"""
+        varlogp, datalogp = self.symbolic_single_sample(
+            [self.model.varlogpt, self.model.datalogpt])
+        return varlogp, datalogp
+
+    @node_property
+    def single_symbolic_varlogp(self):
+        """*Dev* - for single MC sample estimate of :math:`E_{q}(prior term)` `theano.scan`
+        is not needed and code can be optimized"""
+        return self._single_symbolic_varlogp_and_datalogp[0]
+
+    @node_property
+    def single_symbolic_datalogp(self):
+        """*Dev* - for single MC sample estimate of :math:`E_{q}(data term)` `theano.scan`
+        is not needed and code can be optimized"""
+        return self._single_symbolic_varlogp_and_datalogp[1]

     @node_property
     def single_symbolic_logp(self):
         """*Dev* - for single MC sample estimate of :math:`E_{q}(logP)` `theano.scan`
         is not needed and code can be optimized"""
-        return self.symbolic_single_sample(self.model.logpt)
+        return self.single_symbolic_datalogp + self.single_symbolic_varlogp

     @node_property
     def logp_norm(self):
         """*Dev* - normalized :math:`E_{q}(logP)`"""
         return self.logp / self.symbolic_normalizing_constant

+    @node_property
+    def varlogp_norm(self):
+        """*Dev* - normalized :math:`E_{q}(prior term)`"""
+        return self.varlogp / self.symbolic_normalizing_constant
+
+    @node_property
+    def datalogp_norm(self):
+        """*Dev* - normalized :math:`E_{q}(data term)`"""
+        return self.datalogp / self.symbolic_normalizing_constant
+
     @property
     def replacements(self):
         """*Dev* - all replacements from groups to replace PyMC random variables with approximation"""
@@ -1437,7 +1499,8 @@ def get_optimization_replacements(self, s, d):
         repl = collections.OrderedDict()
         # avoid scan if size is constant and equal to one
         if isinstance(s, int) and (s == 1) or s is None:
-            repl[self.logp] = self.single_symbolic_logp
+            repl[self.varlogp] = self.single_symbolic_varlogp
+            repl[self.datalogp] = self.single_symbolic_datalogp
         return repl

     @change_flags(compute_test_value='off')
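Putting the operator and the new opvi plumbing together, the β-scaled objective the tests build can be reproduced in a few lines. A condensed sketch distilled from `test_elbo_beta_kl` above (the total size of 5.0 is an arbitrary illustration):

    import numpy as np
    import pymc3 as pm

    y_obs = np.array([1.6, 1.4])
    beta = len(y_obs) / 5.0  # n / total_size = 0.4

    with pm.Model():
        mu = pm.Normal('mu', mu=1.5, sd=1.0)
        pm.Normal('y', mu=mu, sd=1.0, observed=y_obs)
        mean_field = pm.MeanField()

    with pm.theanof.change_flags(compute_test_value='off'):
        # KL(...) builds the -ELBO_beta operator; the first call binds the
        # (unused for KL) test function, the second the number of MC samples
        neg_elbo_beta = pm.operators.KL(mean_field, beta=beta)()(10000)

    print(-neg_elbo_beta.eval())  # MC estimate of ELBO_beta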
