Skip to content

Commit e72b0d5

Browse files
author
Junpeng Lao
authored
Add offset to glm models (#2889)
* Add `offset` kwarg to glm with test
* Add release note and clean-ups
* Minor potential fail-safe
* Set default to 0.
1 parent b463f2b commit e72b0d5

File tree

4 files changed

+29
-26
lines changed

4 files changed

+29
-26
lines changed

RELEASE-NOTES.md

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -17,24 +17,12 @@
1717
- Add logitnormal distribution
1818
- Densityplot: add support for discrete variables
1919
- Fix the Binomial likelihood in `.glm.families.Binomial`, with the flexibility of specifying the `n`.
20+
- Add `offset` kwarg to `.glm`.
2021

2122
### Fixes
2223

2324
- `VonMises` does not overflow for large values of kappa. i0 and i1 have been removed and we now use log_i0 to compute the logp.
24-
- The bandwidth for KDE plots is computed using a modified version of Scott's rule. The new version uses entropy instead of standard
25-
deviation. This works better for multimodal distributions. Functions using KDE plots has a new argument `bw` controlling the bandwidth.
26-
27-
=======
28-
- Add `logit_p` keyword to `pm.Bernoulli`, so that users can specify the logit
29-
of the success probability. This is faster and more stable than using
30-
`p=tt.nnet.sigmoid(logit_p)`.
31-
- Add `random` keyword to `pm.DensityDist` thus enabling users to pass custom random method
32-
which in turn makes sampling from a `DensityDist` possible.
33-
34-
### Fixes
35-
36-
- `VonMises` does not overflow for large values of kappa. i0 and i1 have been removed and we now use
37-
log_i0 to compute the logp.
25+
- The bandwidth for KDE plots is computed using a modified version of Scott's rule. The new version uses entropy instead of standard deviation. This works better for multimodal distributions. Functions using KDE plots have a new argument `bw` controlling the bandwidth.
3826

3927
### Deprecations
4028

pymc3/glm/linear.py

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -28,12 +28,15 @@ class LinearComponent(Model):
2828
use `Regressor` key for defining default prior for all regressors
2929
defaults to Normal.dist(mu=0, tau=1.0E-6)
3030
vars : dict - random variables instead of creating new ones
31+
offset : scalar, or numpy/theano array with the same shape as y
32+
this can be used to specify an a priori known component to be
33+
included in the linear predictor during fitting.
3134
"""
3235
default_regressor_prior = Normal.dist(mu=0, tau=1.0E-6)
3336
default_intercept_prior = Flat.dist()
3437

3538
def __init__(self, x, y, intercept=True, labels=None,
36-
priors=None, vars=None, name='', model=None):
39+
priors=None, vars=None, name='', model=None, offset=0.):
3740
super(LinearComponent, self).__init__(name, model)
3841
if priors is None:
3942
priors = {}
@@ -77,17 +80,17 @@ def __init__(self, x, y, intercept=True, labels=None,
7780
)
7881
coeffs.append(v)
7982
self.coeffs = tt.stack(coeffs, axis=0)
80-
self.y_est = x.dot(self.coeffs)
83+
self.y_est = x.dot(self.coeffs) + offset
8184

8285
@classmethod
8386
def from_formula(cls, formula, data, priors=None, vars=None,
84-
name='', model=None):
87+
name='', model=None, offset=0.):
8588
import patsy
8689
y, x = patsy.dmatrices(formula, data)
8790
labels = x.design_info.column_names
8891
return cls(np.asarray(x), np.asarray(y)[:, -1], intercept=False,
8992
labels=labels, priors=priors, vars=vars, name=name,
90-
model=model)
93+
model=model, offset=offset)
9194

9295

9396
class GLM(LinearComponent):
@@ -108,12 +111,17 @@ class GLM(LinearComponent):
108111
init : dict - test_vals for coefficients
109112
vars : dict - random variables instead of creating new ones
110113
family : pymc3.glm.families object
114+
offset : scalar, or numpy/theano array with the same shape as y
115+
this can be used to specify an a priori known component to be
116+
included in the linear predictor during fitting.
111117
"""
112118
def __init__(self, x, y, intercept=True, labels=None,
113-
priors=None, vars=None, family='normal', name='', model=None):
119+
priors=None, vars=None, family='normal', name='',
120+
model=None, offset=0.):
114121
super(GLM, self).__init__(
115122
x, y, intercept=intercept, labels=labels,
116-
priors=priors, vars=vars, name=name, model=model
123+
priors=priors, vars=vars, name=name,
124+
model=model, offset=offset
117125
)
118126

119127
_families = dict(
@@ -131,13 +139,14 @@ def __init__(self, x, y, intercept=True, labels=None,
131139

132140
@classmethod
133141
def from_formula(cls, formula, data, priors=None,
134-
vars=None, family='normal', name='', model=None):
142+
vars=None, family='normal', name='',
143+
model=None, offset=0.):
135144
import patsy
136145
y, x = patsy.dmatrices(formula, data)
137146
labels = x.design_info.column_names
138147
return cls(np.asarray(x), np.asarray(y)[:, -1], intercept=False,
139148
labels=labels, priors=priors, vars=vars, family=family,
140-
name=name, model=model)
149+
name=name, model=model, offset=offset)
141150

142151

143152
glm = GLM

pymc3/stats.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -887,10 +887,6 @@ def summary(trace, varnames=None, transform=lambda x: x, stat_funcs=None,
887887
samples. Defaults to the smaller of 100 or the number of samples.
888888
This is only meaningful when `stat_funcs` is None.
889889
890-
See also
891-
--------
892-
summary : Generate a pretty-printed summary of a trace.
893-
894890
Returns
895891
-------
896892
`pandas.DataFrame` with summary statistics for each variable Defaults one

pymc3/tests/test_glm.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,16 @@ def test_glm(self):
5858
assert round(abs(np.mean(trace['x'])-self.slope), 1) == 0
5959
assert round(abs(np.mean(trace['sd'])-self.sd), 1) == 0
6060

61+
def test_glm_offset(self):
62+
offset = 1.
63+
with Model() as model:
64+
GLM.from_formula('y ~ x', self.data_linear, offset=offset)
65+
step = Slice(model.vars)
66+
trace = sample(500, step=step, tune=0, progressbar=False,
67+
random_seed=self.random_seed)
68+
69+
assert round(abs(np.mean(trace['Intercept'])-self.intercept+offset), 1) == 0
70+
6171
def test_glm_link_func(self):
6272
with Model() as model:
6373
GLM.from_formula('y ~ x', self.data_logistic,

0 commit comments

Comments (0)