Commit 866c482

Authored by aseyboldt, committed by Junpeng Lao
Add logp_nojac and logp_sum (#2499)
* Add logp_nojac and logp_sum
* Add model name to name of logp variable
1 parent: 6e02dbc · commit: 866c482

4 files changed (+101 / -10 lines)

pymc3/distributions/distribution.py

Lines changed: 22 additions & 0 deletions
@@ -91,6 +91,28 @@ def _repr_latex_(self, name=None, dist=None):
         """Magic method name for IPython to use for LaTeX formatting."""
         return None
 
+    def logp_nojac(self, *args, **kwargs):
+        """Return the logp, but do not include a jacobian term for transforms.
+
+        If we use different parametrizations for the same distribution, we
+        need to add the determinant of the jacobian of the transformation
+        to make sure the densities still describe the same distribution.
+        However, MAP estimates are not invariant with respect to the
+        parametrization, we need to exclude the jacobian terms in this case.
+
+        This function should be overwritten in base classes for transformed
+        distributions.
+        """
+        return self.logp(*args, **kwargs)
+
+    def logp_sum(self, *args, **kwargs):
+        """Return the sum of the logp values for the given observations.
+
+        Subclasses can use this to improve the speed of logp evaluations
+        if only the sum of the logp values is needed.
+        """
+        return tt.sum(self.logp(*args, **kwargs))
+
     __latex__ = _repr_latex_

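For a plain (untransformed) distribution the two new methods are thin wrappers around logp: logp_nojac returns it unchanged and logp_sum sums it. A minimal sketch of that behaviour, assuming pymc3 at this commit with Theano available; the Normal distribution and the values are chosen only for illustration and are not part of the diff:

import numpy as np
import pymc3 as pm

values = np.array([-1.0, 0.0, 1.0])
dist = pm.Normal.dist(mu=0.0, sd=1.0)   # no transform involved

logp_vals = dist.logp(values).eval()
# logp_nojac falls back to logp; logp_sum is the summed elementwise logp.
assert np.allclose(dist.logp_nojac(values).eval(), logp_vals)
assert np.isclose(dist.logp_sum(values).eval(), logp_vals.sum())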
pymc3/distributions/transforms.py

Lines changed: 3 additions & 0 deletions
@@ -80,6 +80,9 @@ def logp(self, x):
         return (self.dist.logp(self.transform_used.backward(x)) +
                 self.transform_used.jacobian_det(x))
 
+    def logp_nojac(self, x):
+        return self.dist.logp(self.transform_used.backward(x))
+
 transform = Transform

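This override is where logp and logp_nojac actually diverge: for a transformed variable, logp adds the transform's jacobian_det while logp_nojac omits it. A hedged sketch of that relationship, assuming a HalfNormal variable that receives the automatic log transform; the model and value are invented for illustration and are not part of the diff:

import numpy as np
import theano.tensor as tt
import pymc3 as pm

with pm.Model() as model:
    pm.HalfNormal('x', sd=1.0)

x_log = model.free_RVs[0]          # the log-transformed free variable 'x_log__'
trans_dist = x_log.distribution    # a TransformedDistribution
val = tt.constant(0.5)             # a point on the transformed (log) scale

with_jac = trans_dist.logp(val).eval()
without_jac = trans_dist.logp_nojac(val).eval()
jac = trans_dist.transform_used.jacobian_det(val).eval()
# logp = logp_nojac + log|det J| of the transform at this point.
assert np.isclose(with_jac, without_jac + jac)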
pymc3/model.py

Lines changed: 70 additions & 6 deletions
@@ -172,6 +172,18 @@ def d2logp(self, vars=None):
         """Compiled log probability density hessian function"""
         return self.model.fn(hessian(self.logpt, vars))
 
+    @property
+    def logp_nojac(self):
+        return self.model.fn(self.logp_nojact)
+
+    def dlogp_nojac(self, vars=None):
+        """Compiled log density gradient function, without jacobian terms."""
+        return self.model.fn(gradient(self.logp_nojact, vars))
+
+    def d2logp_nojac(self, vars=None):
+        """Compiled log density hessian function, without jacobian terms."""
+        return self.model.fn(hessian(self.logp_nojact, vars))
+
     @property
     def fastlogp(self):
         """Compiled log probability density function"""
@@ -185,13 +197,36 @@ def fastd2logp(self, vars=None):
         """Compiled log probability density hessian function"""
         return self.model.fastfn(hessian(self.logpt, vars))
 
+    @property
+    def fastlogp_nojac(self):
+        return self.model.fastfn(self.logp_nojact)
+
+    def fastdlogp_nojac(self, vars=None):
+        """Compiled log density gradient function, without jacobian terms."""
+        return self.model.fastfn(gradient(self.logp_nojact, vars))
+
+    def fastd2logp_nojac(self, vars=None):
+        """Compiled log density hessian function, without jacobian terms."""
+        return self.model.fastfn(hessian(self.logp_nojact, vars))
+
     @property
     def logpt(self):
         """Theano scalar of log-probability of the model"""
         if getattr(self, 'total_size', None) is not None:
-            logp = tt.sum(self.logp_elemwiset) * self.scaling
+            logp = self.logp_sum_unscaledt * self.scaling
+        else:
+            logp = self.logp_sum_unscaledt
+        if self.name is not None:
+            logp.name = '__logp_%s' % self.name
+        return logp
+
+    @property
+    def logp_nojact(self):
+        """Theano scalar of log-probability, excluding jacobian terms."""
+        if getattr(self, 'total_size', None) is not None:
+            logp = tt.sum(self.logp_nojac_unscaledt) * self.scaling
         else:
-            logp = tt.sum(self.logp_elemwiset)
+            logp = tt.sum(self.logp_nojac_unscaledt)
         if self.name is not None:
             logp.name = '__logp_%s' % self.name
         return logp
@@ -626,9 +661,26 @@ def logp_dlogp_function(self, grad_vars=None, **kwargs):
     def logpt(self):
         """Theano scalar of log-probability of the model"""
         with self:
-            factors = [var.logpt for var in self.basic_RVs] + self.potentials
-            logp = tt.add(*map(tt.sum, factors))
-            logp.name = '__logp'
+            factors = [var.logpt for var in self.basic_RVs]
+            logp_factors = tt.sum(factors)
+            logp_potentials = tt.sum([tt.sum(pot) for pot in self.potentials])
+            logp = logp_factors + logp_potentials
+            if self.name:
+                logp.name = '__logp_%s' % self.name
+            else:
+                logp.name = '__logp'
+            return logp
+
+    @property
+    def logp_nojact(self):
+        """Theano scalar of log-probability of the model"""
+        with self:
+            factors = [var.logp_nojact for var in self.basic_RVs] + self.potentials
+            logp = tt.sum([tt.sum(factor) for factor in factors])
+            if self.name:
+                logp.name = '__logp_nojac_%s' % self.name
+            else:
+                logp.name = '__logp_nojac'
         return logp
 
     @property
@@ -637,7 +689,7 @@ def varlogpt(self):
         (excluding deterministic)."""
         with self:
             factors = [var.logpt for var in self.vars]
-            return tt.add(*map(tt.sum, factors))
+            return tt.sum(factors)
 
     @property
     def vars(self):
@@ -1069,6 +1121,10 @@ def __init__(self, type=None, owner=None, index=None, name=None,
             self.tag.test_value = np.ones(
                 distribution.shape, distribution.dtype) * distribution.default()
             self.logp_elemwiset = distribution.logp(self)
+            # The logp might need scaling in minibatches.
+            # This is done in `Factor`.
+            self.logp_sum_unscaledt = distribution.logp_sum(self)
+            self.logp_nojac_unscaledt = distribution.logp_nojac(self)
             self.total_size = total_size
             self.model = model
             self.scaling = _get_scaling(total_size, self.shape, self.ndim)
@@ -1172,6 +1228,10 @@ def __init__(self, type=None, owner=None, index=None, name=None, data=None,
 
             self.missing_values = data.missing_values
             self.logp_elemwiset = distribution.logp(data)
+            # The logp might need scaling in minibatches.
+            # This is done in `Factor`.
+            self.logp_sum_unscaledt = distribution.logp_sum(data)
+            self.logp_nojac_unscaledt = distribution.logp_nojac(data)
             self.total_size = total_size
             self.model = model
             self.distribution = distribution
@@ -1223,6 +1283,10 @@ def __init__(self, name, data, distribution, total_size=None, model=None):
        self.missing_values = [datum.missing_values for datum in self.data.values()
                               if datum.missing_values is not None]
        self.logp_elemwiset = distribution.logp(**self.data)
+       # The logp might need scaling in minibatches.
+       # This is done in `Factor`.
+       self.logp_sum_unscaledt = distribution.logp_sum(**self.data)
+       self.logp_nojac_unscaledt = distribution.logp_nojac(**self.data)
        self.total_size = total_size
        self.model = model
        self.distribution = distribution

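At the model level the new logp_nojact property and the *_nojac compiled functions mirror the existing logpt/fastlogp, but are built from the unscaled logp_nojac terms of each variable. A hedged sketch comparing the two at a point, with an invented toy model (not part of the diff):

import numpy as np
import pymc3 as pm

np.random.seed(123)
data = np.random.randn(10)

with pm.Model() as model:
    sd = pm.HalfNormal('sd', sd=1.0)            # gets an automatic log transform
    pm.Normal('y', mu=0.0, sd=sd, observed=data)

point = model.test_point
# fastlogp includes the jacobian term of the log transform; fastlogp_nojac does not.
print(model.fastlogp(point))
print(model.fastlogp_nojac(point))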
pymc3/tuning/starting.py

Lines changed: 6 additions & 4 deletions
@@ -18,8 +18,10 @@
 
 __all__ = ['find_MAP']
 
+
 def find_MAP(start=None, vars=None, fmin=None,
-             return_raw=False, model=None, live_disp=False, callback=None, *args, **kwargs):
+             return_raw=False, model=None, live_disp=False, callback=None,
+             *args, **kwargs):
     """
     Sets state to the local maximum a posteriori point given a model.
     Current default of fmin_Hessian does not deal well with optimizing close
@@ -69,7 +71,7 @@ def find_MAP(start=None, vars=None, fmin=None,
     except AttributeError:
         gradient_avail = False
 
-    if disc_vars or not gradient_avail :
+    if disc_vars or not gradient_avail:
         pm._log.warning("Warning: gradient not available." +
                         "(E.g. vars contains discrete variables). MAP " +
                         "estimates may not be accurate for the default " +
@@ -88,13 +90,13 @@ def find_MAP(start=None, vars=None, fmin=None,
     start = Point(start, model=model)
     bij = DictToArrayBijection(ArrayOrdering(vars), start)
 
-    logp = bij.mapf(model.fastlogp)
+    logp = bij.mapf(model.fastlogp_nojac)
     def logp_o(point):
         return nan_to_high(-logp(point))
 
     # Check to see if minimization function actually uses the gradient
     if 'fprime' in getargspec(fmin).args:
-        dlogp = bij.mapf(model.fastdlogp(vars))
+        dlogp = bij.mapf(model.fastdlogp_nojac(vars))
         def grad_logp_o(point):
             return nan_to_num(-dlogp(point))

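With the two substitutions above, find_MAP maximizes the density without jacobian terms, so the estimate should correspond to the mode in the parametrization the model was written in rather than on the transformed scale. A hedged usage sketch with invented data (not part of the diff):

import numpy as np
import pymc3 as pm

np.random.seed(42)
data = np.random.randn(100)

with pm.Model() as model:
    mu = pm.Normal('mu', mu=0.0, sd=10.0)
    sd = pm.HalfNormal('sd', sd=10.0)
    pm.Normal('y', mu=mu, sd=sd, observed=data)
    map_point = pm.find_MAP()

# The optimizer targeted logp_nojact rather than logpt:
print(model.fastlogp_nojac(map_point) >= model.fastlogp_nojac(model.test_point))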