Allow X to be a shared variable (#21)

aloctavodia · web-flow · commit fdba010820df · 2022-10-28T07:36:10.000-03:00
* pps

* pass shared variable

* clean

* fix tests

* check Variable

* avoid reshaping list of trees

* add test

* fix test

* remove comments
diff --git a/pymc_bart/bart.py b/pymc_bart/bart.py
@@ -14,15 +14,20 @@
 #   See the License for the specific language governing permissions and
 #   limitations under the License.
 
+from multiprocessing import Manager
 import aesara.tensor as at
 import numpy as np
 
 from aeppl.logprob import _logprob
 from aesara.tensor.random.op import RandomVariable
+from aesara.tensor.var import Variable
+
 from pandas import DataFrame, Series
 
 from pymc.distributions.distribution import Distribution, _moment
 
+from .utils import sample_posterior
+
 __all__ = ["BART"]
 
 
@@ -34,16 +39,24 @@ class BARTRV(RandomVariable):
     ndims_params = [2, 1, 0, 0, 1]
     dtype = "floatX"
     _print_name = ("BART", "\\operatorname{BART}")
+    all_trees = None
 
     def _supp_shape_from_params(self, dist_params, rep_param_idx=1, param_shapes=None):
-        return (self.X.shape[0],)
+        if isinstance(self.X, Variable):
+            shape = self.X.shape[0].eval()
+        else:
+            shape = self.X.shape[0]
+        return (shape,)
 
     @classmethod
-    def rng_fn(cls, rng, X, Y, m, alpha, split_prior, size):
-        if size is not None:
-            return np.full((size[0], cls.Y.shape[0]), cls.Y.mean())
+    def rng_fn(cls, rng=None, X=None, Y=None, m=None, alpha=None, split_prior=None, size=None):
+        if not cls.all_trees:
+            if size is not None:
+                return np.full((size[0], cls.Y.shape[0]), cls.Y.mean())
+            else:
+                return np.full(cls.Y.shape[0], cls.Y.mean())
         else:
-            return np.full(cls.Y.shape[0], cls.Y.mean())
+            return sample_posterior(cls.all_trees, cls.X)
 
 
 bart = BARTRV()
@@ -69,7 +82,7 @@ class BART(Distribution):
     split_prior : array-like
         Each element of split_prior should be in the [0, 1] interval and the elements should sum to
         1. Otherwise they will be normalized.
-        Defaults to None, i.e. all covariates have the same prior probability to be selected.
+        Defaults to None, i.e. all covariates have the same prior probability to be selected.
     """
 
     def __new__(
@@ -82,17 +95,20 @@ def __new__(
         split_prior=None,
         **kwargs,
     ):
+        manager = Manager()
+        cls.all_trees = manager.list()
 
         X, Y = preprocess_xy(X, Y)
 
         if split_prior is None:
-            split_prior = np.ones(X.shape[1])
+            split_prior = []
 
         bart_op = type(
             f"BART_{name}",
             (BARTRV,),
             dict(
                 name="BART",
+                all_trees=cls.all_trees,
                 inplace=False,
                 initval=Y.mean(),
                 X=X,
@@ -142,8 +158,10 @@ def preprocess_xy(X, Y):
         Y = Y.to_numpy()
     if isinstance(X, (Series, DataFrame)):
         X = X.to_numpy()
+
     Y = Y.astype(float)
     X = X.astype(float)
+
     return X, Y
 
 
diff --git a/pymc_bart/pgbart.py b/pymc_bart/pgbart.py
@@ -17,14 +17,17 @@
 from copy import deepcopy
 from numba import njit
 
-import aesara
 import numpy as np
 
 from aesara import function as aesara_function
+from aesara import config
+from aesara.tensor.var import Variable
+
 from pymc.model import modelcontext
 from pymc.step_methods.arraystep import ArrayStepShared, Competence
 from pymc.aesaraf import inputvars, join_nonshared_inputs, make_shared_replacements
 
+
 from pymc_bart.bart import BARTRV
 from pymc_bart.tree import LeafNode, SplitNode, Tree
 
@@ -53,7 +56,7 @@ class PGBART(ArrayStepShared):
     name = "pgbart"
     default_blocked = False
     generates_stats = True
-    stats_dtypes = [{"variable_inclusion": object, "bart_trees": object}]
+    stats_dtypes = [{"variable_inclusion": object}]
 
     def __init__(
         self,
@@ -72,7 +75,11 @@ def __init__(
         value_bart = vars[0]
         self.bart = model.values_to_rvs[value_bart].owner.op
 
-        self.X = self.bart.X
+        if isinstance(self.bart.X, Variable):
+            self.X = self.bart.X.eval()
+        else:
+            self.X = self.bart.X
+
         self.Y = self.bart.Y
         self.missing_data = np.any(np.isnan(self.X))
         self.m = self.bart.m
@@ -83,7 +90,11 @@ def __init__(
         else:
             self.shape = shape[0]
 
-        self.alpha_vec = self.bart.split_prior
+        # An empty split_prior means none was supplied: fall back to equal weights per covariate.
+        if self.bart.split_prior:
+            self.alpha_vec = self.bart.split_prior
+        else:
+            self.alpha_vec = np.ones(self.X.shape[1])
         self.init_mean = self.Y.mean()
         # if data is binary
         y_unique = np.unique(self.Y)
@@ -98,7 +109,7 @@ def __init__(
         self.available_predictors = list(range(self.num_variates))
 
         self.sum_trees = np.full((self.shape, self.Y.shape[0]), self.init_mean).astype(
-            aesara.config.floatX
+            config.floatX
         )
         self.sum_trees_noi = self.sum_trees - (self.init_mean / self.m)
         self.a_tree = Tree(
@@ -200,7 +211,10 @@ def astep(self, _):
                 for index in used_variates:
                     variable_inclusion[index] += 1
 
-        stats = {"variable_inclusion": variable_inclusion, "bart_trees": self.all_trees}
+        if not self.tune:
+            self.bart.all_trees.append(self.all_trees)
+
+        stats = {"variable_inclusion": variable_inclusion}
         return self.sum_trees, [stats]
 
     def normalize(self, particles):
@@ -261,7 +275,7 @@ def systematic(self, normalized_weights):
         Note: adapted from https://github.com/nchopin/particles
         """
         lnw = len(normalized_weights)
-        single_uniform = (self.uniform.random() + np.arange(lnw)) / lnw
+        single_uniform = (self.uniform.random()[0] + np.arange(lnw)) / lnw
         return inverse_cdf(single_uniform, normalized_weights) + 2
 
     def init_particles(self, tree_id: int) -> np.ndarray:
diff --git a/pymc_bart/tree.py b/pymc_bart/tree.py
@@ -16,7 +16,7 @@
 
 from copy import deepcopy
 
-import aesara
+from aesara import config
 import numpy as np
 
 
@@ -59,7 +59,7 @@ def __init__(self, leaf_node_value, idx_data_points, num_observations, shape):
             0: LeafNode(index=0, value=leaf_node_value, idx_data_points=idx_data_points)
         }
         self.idx_leaf_nodes = [0]
-        self.output = np.zeros((num_observations, shape)).astype(aesara.config.floatX).squeeze()
+        self.output = np.zeros((num_observations, shape)).astype(config.floatX).squeeze()
 
     def __getitem__(self, index):
         return self.get_node(index)
diff --git a/pymc_bart/utils.py b/pymc_bart/utils.py
@@ -4,20 +4,21 @@
 import matplotlib.pyplot as plt
 import numpy as np
 
+from aesara.tensor.var import Variable
 from numpy.random import RandomState
 from scipy.interpolate import griddata
 from scipy.signal import savgol_filter
 from scipy.stats import pearsonr
 
 
-def predict(idata, rng, X, size=None, excluded=None):
+def predict(bartrv, rng, X, size=None, excluded=None):
     """
     Generate samples from the BART-posterior.
 
     Parameters
     ----------
-    idata : InferenceData
-        InferenceData containing a collection of BART_trees in sample_stats group
+    bartrv : BART Random Variable
+        BART variable, once the model that includes it has been fitted.
     rng: NumPy random generator
     X : array-like
         A covariate matrix. Use the same used to fit BART for in-sample predictions or a new one for
@@ -27,8 +28,10 @@ def predict(idata, rng, X, size=None, excluded=None):
     excluded : list
         indexes of the variables to exclude when computing predictions
     """
-    bart_trees = idata.sample_stats.bart_trees
-    stacked_trees = bart_trees.stack(trees=["chain", "draw"])
+    stacked_trees = bartrv.owner.op.all_trees
+    if isinstance(X, Variable):
+        X = X.eval()
+
     if size is None:
         size = ()
     elif isinstance(size, int):
@@ -38,20 +41,49 @@ def predict(idata, rng, X, size=None, excluded=None):
     for s in size:
         flatten_size *= s
 
-    idx = rng.randint(len(stacked_trees.trees), size=flatten_size)
-    shape = stacked_trees.isel(trees=0).values[0].predict(X[0]).size
+    idx = rng.randint(len(stacked_trees), size=flatten_size)
+    shape = stacked_trees[0][0].predict(X[0]).size
 
     pred = np.zeros((flatten_size, X.shape[0], shape))
 
     for ind, p in enumerate(pred):
-        for tree in stacked_trees.isel(trees=idx[ind]).values:
+        for tree in stacked_trees[idx[ind]]:
             p += np.array([tree.predict(x, excluded) for x in X])
     pred.reshape((*size, shape, -1))
     return pred
 
 
+def sample_posterior(all_trees, X):
+    """
+    Generate samples from the BART-posterior.
+
+    Parameters
+    ----------
+    all_trees : list
+        List of all trees sampled from a posterior
+    X : array-like
+        A covariate matrix. Use the same used to fit BART for in-sample predictions or a new one for
+        out-of-sample predictions.
+    m : int
+        Number of trees. NOTE: not an argument of this function; stale docstring entry.
+    """
+    stacked_trees = all_trees
+    idx = np.random.randint(len(stacked_trees))
+    if isinstance(X, Variable):
+        X = X.eval()
+
+    shape = stacked_trees[0][0].predict(X[0]).size
+
+    pred = np.zeros((1, X.shape[0], shape))
+
+    for p in pred:
+        for tree in stacked_trees[idx]:
+            p += np.array([tree.predict(x) for x in X])
+    return pred.squeeze()
+
+
 def plot_dependence(
-    idata,
+    bartrv,
     X,
     Y=None,
     kind="pdp",
@@ -79,8 +111,8 @@ def plot_dependence(
 
     Parameters
     ----------
-    idata: InferenceData
-        InferenceData containing a collection of BART_trees in sample_stats group
+    bartrv : BART Random Variable
+        BART variable, once the model that includes it has been fitted.
     X : array-like
         The covariate matrix.
     Y : array-like
@@ -149,6 +181,9 @@ def plot_dependence(
 
     rng = RandomState(seed=random_seed)
 
+    if isinstance(X, Variable):
+        X = X.eval()
+
     if hasattr(X, "columns") and hasattr(X, "values"):
         x_names = list(X.columns)
         X = X.values
@@ -207,13 +242,13 @@ def plot_dependence(
             for x_i in new_x_i:
                 new_X[:, indices_mi] = X[:, indices_mi]
                 new_X[:, i] = x_i
-                y_pred.append(np.mean(predict(idata, rng, X=new_X, size=samples), 1))
+                y_pred.append(np.mean(predict(bartrv, rng, X=new_X, size=samples), 1))
             new_x_target.append(new_x_i)
         else:
             for instance in instances:
                 new_X = X[idx_s]
                 new_X[:, indices_mi] = X[:, indices_mi][instance]
-                y_pred.append(np.mean(predict(idata, rng, X=new_X, size=samples), 0))
+                y_pred.append(np.mean(predict(bartrv, rng, X=new_X, size=samples), 0))
             new_x_target.append(new_X[:, i])
         y_mins.append(np.min(y_pred))
         new_y.append(np.array(y_pred).T)
@@ -310,7 +345,7 @@ def plot_dependence(
 
 
 def plot_variable_importance(
-    idata, X, labels=None, sort_vars=True, figsize=None, samples=100, random_seed=None
+    idata, bartrv, X, labels=None, sort_vars=True, figsize=None, samples=100, random_seed=None
 ):
     """
     Estimates variable importance from the BART-posterior.
@@ -319,6 +354,8 @@ def plot_variable_importance(
     ----------
     idata: InferenceData
         InferenceData containing a collection of BART_trees in sample_stats group
+    bartrv : BART Random Variable
+        BART variable, once the model that includes it has been fitted.
     X : array-like
         The covariate matrix.
     labels : list
@@ -365,12 +402,12 @@ def plot_variable_importance(
     axes[0].set_xlabel("covariables")
     axes[0].set_ylabel("importance")
 
-    predicted_all = predict(idata, rng, X=X, size=samples, excluded=None)
+    predicted_all = predict(bartrv, rng, X=X, size=samples, excluded=None)
 
     ev_mean = np.zeros(len(var_imp))
     ev_hdi = np.zeros((len(var_imp), 2))
     for idx, subset in enumerate(subsets):
-        predicted_subset = predict(idata, rng, X=X, size=samples, excluded=subset)
+        predicted_subset = predict(bartrv, rng, X=X, size=samples, excluded=subset)
         pearson = np.zeros(samples)
         for j in range(samples):
             pearson[j] = (
diff --git a/tests/test_bart.py b/tests/test_bart.py