Clean utils (#23)

aloctavodia · web-flow · commit f32bc75f87dc · 2022-10-28T18:04:51.000-03:00
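* clean utils: merge `predict` and `sample_posterior` into a single private `_sample_posterior`, and replace `RandomState` with `np.random.default_rng`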

* remove comments

* remove unused import
diff --git a/pymc_bart/__init__.py b/pymc_bart/__init__.py
@@ -15,7 +15,7 @@
 
 from pymc_bart.bart import BART
 from pymc_bart.pgbart import PGBART
-from pymc_bart.utils import plot_dependence, plot_variable_importance, predict
+from pymc_bart.utils import plot_dependence, plot_variable_importance
 
 __all__ = ["BART", "PGBART"]
 __version__ = "0.1.0"
diff --git a/pymc_bart/bart.py b/pymc_bart/bart.py
@@ -26,7 +26,7 @@
 
 from pymc.distributions.distribution import Distribution, _moment
 
-from .utils import sample_posterior
+from .utils import _sample_posterior
 
 __all__ = ["BART"]
 
@@ -56,7 +56,7 @@ def rng_fn(cls, rng=None, X=None, Y=None, m=None, alpha=None, split_prior=None,
             else:
                 return np.full(cls.Y.shape[0], cls.Y.mean())
         else:
-            return sample_posterior(cls.all_trees, cls.X)
+            return _sample_posterior(cls.all_trees, cls.X, rng=rng).squeeze()
 
 
 bart = BARTRV()
diff --git a/pymc_bart/utils.py b/pymc_bart/utils.py
@@ -5,30 +5,29 @@
 import numpy as np
 
 from aesara.tensor.var import Variable
-from numpy.random import RandomState
 from scipy.interpolate import griddata
 from scipy.signal import savgol_filter
 from scipy.stats import pearsonr
 
 
-def predict(bartrv, rng, X, size=None, excluded=None):
+def _sample_posterior(all_trees, X, rng, size=None, excluded=None):
     """
     Generate samples from the BART-posterior.
 
     Parameters
     ----------
-    bartrv : BART Random Variable
-        BART variable once the model that include it has been fitted.
-    rng: NumPy random generator
+    all_trees : list
+        List of all trees sampled from the posterior.
     X : array-like
         A covariate matrix. Use the same used to fit BART for in-sample predictions or a new one for
         out-of-sample predictions.
+    rng : numpy.random.Generator
     size : int or tuple
         Number of samples.
     excluded : list
-        indexes of the variables to exclude when computing predictions
+        Indexes of the variables to exclude when computing predictions
     """
-    stacked_trees = bartrv.owner.op.all_trees
+    stacked_trees = all_trees
     if isinstance(X, Variable):
         X = X.eval()
 
@@ -41,7 +40,7 @@ def predict(bartrv, rng, X, size=None, excluded=None):
     for s in size:
         flatten_size *= s
 
-    idx = rng.randint(len(stacked_trees), size=flatten_size)
+    idx = rng.integers(0, len(stacked_trees), size=flatten_size)
     shape = stacked_trees[0][0].predict(X[0]).size
 
     pred = np.zeros((flatten_size, X.shape[0], shape))
@@ -53,35 +52,6 @@ def predict(bartrv, rng, X, size=None, excluded=None):
     return pred
 
 
-def sample_posterior(all_trees, X):
-    """
-    Generate samples from the BART-posterior.
-
-    Parameters
-    ----------
-    all_trees : list
-        List of all trees sampled from a posterior
-    X : array-like
-        A covariate matrix. Use the same used to fit BART for in-sample predictions or a new one for
-        out-of-sample predictions.
-    m : int
-        Number of trees
-    """
-    stacked_trees = all_trees
-    idx = np.random.randint(len(stacked_trees))
-    if isinstance(X, Variable):
-        X = X.eval()
-
-    shape = stacked_trees[0][0].predict(X[0]).size
-
-    pred = np.zeros((1, X.shape[0], shape))
-
-    for p in pred:
-        for tree in stacked_trees[idx]:
-            p += np.array([tree.predict(x) for x in X])
-    return pred.squeeze()
-
-
 def plot_dependence(
     bartrv,
     X,
@@ -179,8 +149,6 @@ def plot_dependence(
                           Available option are 'insample', 'linear' or 'quantiles'"""
         )
 
-    rng = RandomState(seed=random_seed)
-
     if isinstance(X, Variable):
         X = X.eval()
 
@@ -195,6 +163,8 @@ def plot_dependence(
     else:
         y_label = "Predicted Y"
 
+    rng = np.random.default_rng(random_seed)
+
     num_covariates = X.shape[1]
 
     indices = list(range(num_covariates))
@@ -216,14 +186,15 @@ def plot_dependence(
         xs_values = [0.05, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.95]
 
     if kind == "ice":
-        instances = np.random.choice(range(X.shape[0]), replace=False, size=instances)
+        instances = rng.choice(range(X.shape[0]), replace=False, size=instances)
 
     new_y = []
     new_x_target = []
     y_mins = []
 
     new_X = np.zeros_like(X)
     idx_s = list(range(X.shape[0]))
+    all_trees = bartrv.owner.op.all_trees
     for i in var_idx:
         indices_mi = indices[:]
         indices_mi.pop(i)
@@ -242,13 +213,17 @@ def plot_dependence(
             for x_i in new_x_i:
                 new_X[:, indices_mi] = X[:, indices_mi]
                 new_X[:, i] = x_i
-                y_pred.append(np.mean(predict(bartrv, rng, X=new_X, size=samples), 1))
+                y_pred.append(
+                    np.mean(_sample_posterior(all_trees, X=new_X, rng=rng, size=samples), 1)
+                )
             new_x_target.append(new_x_i)
         else:
             for instance in instances:
                 new_X = X[idx_s]
                 new_X[:, indices_mi] = X[:, indices_mi][instance]
-                y_pred.append(np.mean(predict(bartrv, rng, X=new_X, size=samples), 0))
+                y_pred.append(
+                    np.mean(_sample_posterior(all_trees, X=new_X, rng=rng, size=samples), 0)
+                )
             new_x_target.append(new_X[:, i])
         y_mins.append(np.min(y_pred))
         new_y.append(np.array(y_pred).T)
@@ -328,7 +303,7 @@ def plot_dependence(
                     nxi,
                     nyi,
                     smooth=smooth,
-                    fill_kwargs={"alpha": alpha},
+                    fill_kwargs={"alpha": alpha, "color": color},
                     ax=ax,
                 )
                 ax.plot(nxi[idx], nyi[idx].mean(0), color=color)
@@ -374,7 +349,6 @@ def plot_variable_importance(
     idxs: indexes of the covariates from higher to lower relative importance
     axes: matplotlib axes
     """
-    rng = RandomState(seed=random_seed)
     _, axes = plt.subplots(2, 1, figsize=figsize)
 
     if hasattr(X, "columns") and hasattr(X, "values"):
@@ -387,6 +361,8 @@ def plot_variable_importance(
     else:
         labels = np.array(labels)
 
+    rng = np.random.default_rng(random_seed)
+
     ticks = np.arange(len(var_imp), dtype=int)
     idxs = np.argsort(var_imp)
     subsets = [idxs[:-i] for i in range(1, len(idxs))]
@@ -402,12 +378,14 @@ def plot_variable_importance(
     axes[0].set_xlabel("covariables")
     axes[0].set_ylabel("importance")
 
-    predicted_all = predict(bartrv, rng, X=X, size=samples, excluded=None)
+    all_trees = bartrv.owner.op.all_trees
+
+    predicted_all = _sample_posterior(all_trees, X=X, rng=rng, size=samples, excluded=None)
 
     ev_mean = np.zeros(len(var_imp))
     ev_hdi = np.zeros((len(var_imp), 2))
     for idx, subset in enumerate(subsets):
-        predicted_subset = predict(bartrv, rng, X=X, size=samples, excluded=subset)
+        predicted_subset = _sample_posterior(all_trees, X=X, rng=rng, size=samples, excluded=subset)
         pearson = np.zeros(samples)
         for j in range(samples):
             pearson[j] = (
diff --git a/tests/test_bart.py b/tests/test_bart.py
@@ -91,11 +91,12 @@ class TestUtils:
         y = pm.Normal("y", mu, sigma, observed=Y)
         idata = pm.sample(random_seed=3415)
 
-    def test_predict(self):
-        rng = RandomState(12345)
-        pred_all = pmb.predict(self.mu, rng, X=self.X, size=2)
-        rng = RandomState(12345)
-        pred_first = pmb.predict(self.mu, rng, X=self.X[:10])
+    def test_sample_posterior(self):
+        all_trees = self.mu.owner.op.all_trees
+        rng = np.random.default_rng(3)
+        pred_all = pmb.utils._sample_posterior(all_trees, X=self.X, rng=rng, size=2)
+        rng = np.random.default_rng(3)
+        pred_first = pmb.utils._sample_posterior(all_trees, X=self.X[:10], rng=rng)
 
         assert_almost_equal(pred_first[0], pred_all[0, :10], decimal=4)
         assert pred_all.shape == (2, 50, 1)