@@ -123,11 +123,15 @@ def sample(draws=500, step=None, init='auto', n_init=200000, start=None,
         Initialization method to use for auto-assigned NUTS samplers.
 
         * auto : Choose a default initialization method automatically.
-          Currently, this is `'advi+adapt_diag'`, but this can change in
+          Currently, this is `'jitter+adapt_diag'`, but this can change in
           the future. If you depend on the exact behaviour, choose an
           initialization method explicitly.
         * adapt_diag : Start with an identity mass matrix and then adapt
-          a diagonal based on the variance of the tuning samples.
+          a diagonal based on the variance of the tuning samples. All
+          chains use the test value (usually the prior mean) as starting
+          point.
+        * jitter+adapt_diag : Same as `adapt_diag`, but add uniform jitter
+          in [-1, 1] to the starting point in each chain.
         * advi+adapt_diag : Run ADVI and then adapt the resulting diagonal
           mass matrix based on the sample variance of the tuning samples.
         * advi+adapt_diag_grad : Run ADVI and then adapt the resulting
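The options documented above are selected through the `init` argument of `pm.sample`, whose signature appears in the hunk header. A minimal usage sketch (the toy model and data are made up for illustration):

```python
import pymc3 as pm

with pm.Model():
    mu = pm.Normal('mu', mu=0., sd=1.)
    pm.Normal('obs', mu=mu, sd=1., observed=[0.1, -0.3, 0.2])

    # Rely on the default: with this change, init='auto' resolves to
    # 'jitter+adapt_diag'.
    trace = pm.sample(500)

    # Or choose an initialization method explicitly, here ADVI-based
    # adaptation with a shorter ADVI run than the default n_init=200000.
    trace_advi = pm.sample(500, init='advi+adapt_diag', n_init=10000)
```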
@@ -695,11 +699,15 @@ def init_nuts(init='auto', njobs=1, n_init=500000, model=None,
         Initialization method to use.
 
         * auto : Choose a default initialization method automatically.
-          Currently, this is `'advi+adapt_diag'`, but this can change in
+          Currently, this is `'jitter+adapt_diag'`, but this can change in
           the future. If you depend on the exact behaviour, choose an
           initialization method explicitly.
         * adapt_diag : Start with an identity mass matrix and then adapt
-          a diagonal based on the variance of the tuning samples.
+          a diagonal based on the variance of the tuning samples. All
+          chains use the test value (usually the prior mean) as starting
+          point.
+        * jitter+adapt_diag : Same as `adapt_diag`, but add uniform jitter
+          in [-1, 1] to the starting point in each chain.
         * advi+adapt_diag : Run ADVI and then adapt the resulting diagonal
           mass matrix based on the sample variance of the tuning samples.
         * advi+adapt_diag_grad : Run ADVI and then adapt the resulting
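`init_nuts` can also be called directly. A small sketch, assuming (as in this version of `sampling.py`) that it returns a tuple of start point(s) and a NUTS step method:

```python
import pymc3 as pm
from pymc3.sampling import init_nuts

with pm.Model():
    pm.Normal('x', mu=0., sd=1.)

    # 'auto' now resolves to 'jitter+adapt_diag'; with njobs > 1 each
    # chain gets its own jittered start point.
    start, step = init_nuts(init='auto', njobs=2)
```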
@@ -746,7 +754,7 @@ def init_nuts(init='auto', njobs=1, n_init=500000, model=None,
     init = init.lower()
 
     if init == 'auto':
-        init = 'advi+adapt_diag'
+        init = 'jitter+adapt_diag'
 
     pm._log.info('Initializing NUTS using {}...'.format(init))
 
@@ -767,6 +775,19 @@ def init_nuts(init='auto', njobs=1, n_init=500000, model=None,
             model.ndim, mean, var, 10)
         if njobs == 1:
             start = start[0]
+    elif init == 'jitter+adapt_diag':
+        start = []
+        for _ in range(njobs):
+            mean = {var: val.copy() for var, val in model.test_point.items()}
+            for val in mean.values():
+                val[...] += 2 * np.random.rand(*val.shape) - 1
+            start.append(mean)
+        mean = np.mean([model.dict_to_array(vals) for vals in start], axis=0)
+        var = np.ones_like(mean)
+        potential = quadpotential.QuadPotentialDiagAdapt(
+            model.ndim, mean, var, 10)
+        if njobs == 1:
+            start = start[0]
     elif init == 'advi+adapt_diag_grad':
         approx = pm.fit(
             random_seed=random_seed,
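The new `jitter+adapt_diag` branch above can be mimicked outside of PyMC3 with plain NumPy. This is an illustrative sketch only; `jittered_start_points` and the example `test_point` are hypothetical names standing in for `model.test_point` in the real code:

```python
import numpy as np

def jittered_start_points(test_point, njobs, rng=np.random):
    """One start point per chain: the test value plus uniform
    jitter in [-1, 1], as in the 'jitter+adapt_diag' branch."""
    starts = []
    for _ in range(njobs):
        point = {name: val.copy() for name, val in test_point.items()}
        for val in point.values():
            val[...] += 2 * rng.rand(*val.shape) - 1
        starts.append(point)
    return starts

# Hypothetical test point with a scalar and a length-3 vector.
test_point = {'mu': np.zeros(()), 'beta': np.zeros(3)}
for chain, start in enumerate(jittered_start_points(test_point, njobs=4)):
    print(chain, start)
```

The mean of these jittered points, together with a unit diagonal, then seeds `QuadPotentialDiagAdapt`, which adapts the diagonal mass matrix from the tuning samples.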