pymc-devs
diff --git a/RELEASE-NOTES.md
Lines changed: 1 addition & 0 deletions b/RELEASE-NOTES.md
Lines changed: 1 addition & 0 deletions
diff --git a/docs/source/notebooks/model_averaging.ipynb
Lines changed: 54 additions & 85 deletions b/docs/source/notebooks/model_averaging.ipynb
Lines changed: 54 additions & 85 deletions
diff --git a/docs/source/notebooks/model_comparison.ipynb
Lines changed: 114 additions & 102 deletions b/docs/source/notebooks/model_comparison.ipynb
Lines changed: 114 additions & 102 deletions
diff --git a/pymc3/stats.py
Lines changed: 12 additions & 9 deletions b/pymc3/stats.py
Lines changed: 12 additions & 9 deletions
diff --git a/pymc3/tests/test_stats.py
Lines changed: 14 additions & 8 deletions b/pymc3/tests/test_stats.py
Lines changed: 14 additions & 8 deletions
@@ -18,6 +18,7 @@
 - Densityplot: add support for discrete variables
 - Fix the Binomial likelihood in `.glm.families.Binomial`, with the flexibility of specifying the `n`. 
 - Add `offset` kwarg to `.glm`.
+- Changed the `compare` function to accept a dictionary of model-trace pairs instead of two separate lists of models and traces.
 
 ### Fixes
 
 
@@ -456,7 +456,7 @@ def _gpinv(p, k, sigma):
     return x
 
 
-def compare(traces, models, ic='WAIC', method='stacking', b_samples=1000,
+def compare(model_dict, ic='WAIC', method='stacking', b_samples=1000,
             alpha=1, seed=None, round_to=2):
     R"""Compare models based on the widely available information criterion (WAIC)
     or leave-one-out (LOO) cross-validation.
@@ -465,9 +465,7 @@ def compare(traces, models, ic='WAIC', method='stacking', b_samples=1000,
 
     Parameters
     ----------
-    traces : list of PyMC3 traces
-    models : list of PyMC3 models
-        in the same order as traces.
+    model_dict : dict
+        Maps each PyMC3 model (key) to its corresponding PyMC3 trace (value).
     ic : string
         Information Criterion (WAIC or LOO) used to compare models.
         Default WAIC.
@@ -520,23 +518,28 @@ def compare(traces, models, ic='WAIC', method='stacking', b_samples=1000,
     warning : A value of 1 indicates that the computation of the IC may not be
         reliable. Details see the related warning message in pm.waic and pm.loo
     """
+
+    names = [model.name for model in model_dict if model.name]
+    if not names:
+        names = np.arange(len(model_dict))
+
     if ic == 'WAIC':
         ic_func = waic
-        df_comp = pd.DataFrame(index=np.arange(len(models)),
+        df_comp = pd.DataFrame(index=names,
                                columns=['WAIC', 'pWAIC', 'dWAIC', 'weight',
                                         'SE', 'dSE', 'var_warn'])
 
     elif ic == 'LOO':
         ic_func = loo
-        df_comp = pd.DataFrame(index=np.arange(len(models)),
+        df_comp = pd.DataFrame(index=names,
                                columns=['LOO', 'pLOO', 'dLOO', 'weight',
                                         'SE', 'dSE', 'shape_warn'])
 
     else:
         raise NotImplementedError(
             'The information criterion {} is not supported.'.format(ic))
 
-    if len(set([len(m.observed_RVs) for m in models])) != 1:
+    if len(set([len(m.observed_RVs) for m in model_dict])) != 1:
         raise ValueError(
             'The number of observed RVs should be the same across all models')
 
@@ -545,8 +548,8 @@ def compare(traces, models, ic='WAIC', method='stacking', b_samples=1000,
                          'is not supported.'.format(method))
 
     ics = []
-    for c, (t, m) in enumerate(zip(traces, models)):
-        ics.append((c, ic_func(t, m, pointwise=True)))
+    for n, (m, t) in zip(names, model_dict.items()):
+        ics.append((n, ic_func(t, m, pointwise=True)))
 
     ics.sort(key=lambda x: x[1][0])
 
 
@@ -13,6 +13,7 @@
 from numpy.random import random, normal
 from numpy.testing import assert_equal, assert_almost_equal, assert_array_almost_equal
 from scipy import stats as st
+import copy
 
 
 def test_log_post_trace():
@@ -67,12 +68,14 @@ def test_compare():
         x = pm.StudentT('x', nu=1, mu=mu, lam=1, observed=x_obs)
         trace2 = pm.sample(1000)
 
-    traces = [trace0] * 2
-    models = [model0] * 2
+    traces = [trace0, copy.copy(trace0)]
+    models = [model0, copy.copy(model0)]
 
-    w_st = pm.compare(traces, models, method='stacking')['weight']
-    w_bb_bma = pm.compare(traces, models, method='BB-pseudo-BMA')['weight']
-    w_bma = pm.compare(traces, models, method='pseudo-BMA')['weight']
+    model_dict = dict(zip(models, traces))
+
+    w_st = pm.compare(model_dict, method='stacking')['weight']
+    w_bb_bma = pm.compare(model_dict, method='BB-pseudo-BMA')['weight']
+    w_bma = pm.compare(model_dict, method='pseudo-BMA')['weight']
 
     assert_almost_equal(w_st[0], w_st[1])
     assert_almost_equal(w_bb_bma[0], w_bb_bma[1])
@@ -84,9 +87,12 @@ def test_compare():
 
     traces = [trace0, trace1, trace2]
     models = [model0, model1, model2]
-    w_st = pm.compare(traces, models, method='stacking')['weight']
-    w_bb_bma = pm.compare(traces, models, method='BB-pseudo-BMA')['weight']
-    w_bma = pm.compare(traces, models, method='pseudo-BMA')['weight']
+
+    model_dict = dict(zip(models, traces))
+    
+    w_st = pm.compare(model_dict, method='stacking')['weight']
+    w_bb_bma = pm.compare(model_dict, method='BB-pseudo-BMA')['weight']
+    w_bma = pm.compare(model_dict, method='pseudo-BMA')['weight']
 
     assert(w_st[0] > w_st[1] > w_st[2])
     assert(w_bb_bma[0] > w_bb_bma[1] > w_bb_bma[2])