
Commit 842a696

BART with non-gaussian likelihoods (#4675)
* allow unbounded likelihoods, add inv_link and small refactor
* add test
* fix test
* set jitter optional, update release notes
1 parent 3447619 commit 842a696

File tree: 5 files changed, +208 -42 lines changed

RELEASE-NOTES.md

Lines changed: 3 additions & 0 deletions
```diff
@@ -5,6 +5,9 @@
 + A deprecation warning from the `semver` package we use for checking backend compatibility was dealt with (see [#4547](https://github.com/pymc-devs/pymc3/pull/4547)).
 + `theano.printing.pydotprint` is now hotfixed upon import (see [#4594](https://github.com/pymc-devs/pymc3/pull/4594)).
 
+### New Features
++ BART with non-gaussian likelihoods (see [#4675](https://github.com/pymc-devs/pymc3/pull/4675)).
+
 ## PyMC3 3.11.2 (14 March 2021)
 
 ### New Features
```
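The headline change is easiest to see end to end. Below is a minimal, hypothetical usage sketch of what this commit enables: pairing `pm.BART` with a Bernoulli likelihood through the new `inv_link` argument. The synthetic data and the choice of `scipy.special.expit` as the inverse link are illustrative assumptions, not taken from the commit itself.

```python
# Hedged sketch: BART with a non-gaussian (Bernoulli) likelihood via inv_link.
import numpy as np
import pymc3 as pm
from scipy.special import expit

rng = np.random.default_rng(0)
X = rng.uniform(0, 10, size=(100, 2))
Y = (X[:, 0] > 5).astype(float)  # synthetic binary response

with pm.Model():
    # inv_link squashes the latent sum-of-trees onto the (0, 1) probability scale
    mu = pm.BART("mu", X, Y, m=50, inv_link=expit)
    y_obs = pm.Bernoulli("y_obs", p=mu, observed=Y)
    trace = pm.sample(1000, tune=1000)  # PGBART is assigned to mu automatically
```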

pymc3/distributions/bart.py

Lines changed: 69 additions & 17 deletions
```diff
@@ -23,11 +23,24 @@
 
 
 class BaseBART(NoDistribution):
-    def __init__(self, X, Y, m=200, alpha=0.25, split_prior=None, *args, **kwargs):
-
+    def __init__(
+        self,
+        X,
+        Y,
+        m=200,
+        alpha=0.25,
+        split_prior=None,
+        scale=None,
+        inv_link=None,
+        jitter=False,
+        *args,
+        **kwargs,
+    ):
+
+        self.jitter = jitter
         self.X, self.Y, self.missing_data = self.preprocess_XY(X, Y)
 
-        super().__init__(shape=X.shape[0], dtype="float64", testval=0, *args, **kwargs)
+        super().__init__(shape=X.shape[0], dtype="float64", testval=self.Y.mean(), *args, **kwargs)
 
         if self.X.ndim != 2:
             raise ValueError("The design matrix X must have two dimensions")
```
```diff
@@ -48,13 +61,24 @@ def __init__(self, X, Y, m=200, alpha=0.25, split_prior=None, *args, **kwargs):
                 "The value for the alpha parameter for the tree structure "
                 "must be in the interval (0, 1)"
             )
+        self.m = m
+        self.alpha = alpha
+        self.y_std = Y.std()
+
+        if scale is None:
+            self.leaf_scale = NormalSampler(sigma=None)
+        elif isinstance(scale, (int, float)):
+            self.leaf_scale = NormalSampler(sigma=Y.std() / self.m ** scale)
+
+        if inv_link is None:
+            self.inv_link = lambda x: x
+        else:
+            self.inv_link = inv_link
 
         self.num_observations = X.shape[0]
         self.num_variates = X.shape[1]
         self.available_predictors = list(range(self.num_variates))
         self.ssv = SampleSplittingVariable(split_prior, self.num_variates)
-        self.m = m
-        self.alpha = alpha
         self.trees = self.init_list_of_trees()
         self.all_trees = []
         self.mean = fast_mean()
```
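The `scale` argument above sets the standard deviation of the Gaussian noise added to each proposed leaf value. A standalone sketch of the arithmetic (all numbers are made up):

```python
import numpy as np

# Illustrative only: the leaf-noise sigma shrinks as the number of trees m grows.
Y = np.random.normal(10, 2, size=500)
m, scale = 200, 0.5

sigma = Y.std() / m ** scale   # what NormalSampler receives when scale is a number
# scale=None instead yields NormalSampler(sigma=None), whose draws are exactly 0
print(round(sigma, 3))         # roughly 2 / sqrt(200), i.e. about 0.14
```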
```diff
@@ -66,7 +90,9 @@ def preprocess_XY(self, X, Y):
         if isinstance(X, (Series, DataFrame)):
             X = X.to_numpy()
         missing_data = np.any(np.isnan(X))
-        X = np.random.normal(X, np.std(X, 0) / 100)
+        if self.jitter:
+            X = np.random.normal(X, np.nanstd(X, 0) / 100000)
+        Y = Y.astype(float)
         return X, Y, missing_data
 
     def init_list_of_trees(self):
```
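Jittering is now opt-in and far smaller than before (`/ 100000` instead of `/ 100`, with `nanstd` so missing data does not poison the scale). A quick sketch of what it does to tied values:

```python
import numpy as np

# Tied X values get hair-thin Gaussian noise, creating distinct candidate splits.
X = np.array([[1.0, 2.0],
              [1.0, 2.0],   # duplicate row: no usable split point without jitter
              [1.0, 3.0]])
X_jittered = np.random.normal(X, np.nanstd(X, axis=0) / 100000)
print(np.unique(X_jittered[:, 1]).size)  # 3 distinct values instead of 2
```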
```diff
@@ -155,32 +181,27 @@ def get_new_idx_data_points(self, current_split_node, idx_data_points):
 
     def get_residuals(self):
         """Compute the residuals."""
-        R_j = self.Y - self.sum_trees_output
-        return R_j
+        R_j = self.Y - self.inv_link(self.sum_trees_output)
 
-    def get_residuals_loo(self, tree):
-        """Compute the residuals without leaving the passed tree out."""
-        R_j = self.Y - (self.sum_trees_output - tree.predict_output(self.num_observations))
         return R_j
 
     def draw_leaf_value(self, idx_data_points):
-        """ Draw the residual mean."""
+        """Draw the residual mean."""
         R_j = self.get_residuals()[idx_data_points]
-        draw = self.mean(R_j)
+        draw = self.mean(R_j) + self.leaf_scale.random()
         return draw
 
     def predict(self, X_new):
         """Compute out of sample predictions evaluated at X_new"""
         trees = self.all_trees
         num_observations = X_new.shape[0]
         pred = np.zeros((len(trees), num_observations))
-        np.random.randint(len(trees))
         for draw, trees_to_sum in enumerate(trees):
             new_Y = np.zeros(num_observations)
             for tree in trees_to_sum:
                 new_Y += [tree.predict_out_of_sample(x) for x in X_new]
             pred[draw] = new_Y
-        return pred
+        return self.inv_link(pred)
 
 
 def compute_prior_probability(alpha):
```
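With a non-identity link, residuals are now formed on the response scale: the latent sum of trees is pushed through `inv_link` before being compared with `Y`, and `predict` applies the same transform to its output. A small sketch, using `expit` as an assumed inverse link and made-up latent values:

```python
import numpy as np
from scipy.special import expit

Y = np.array([0.0, 1.0, 1.0, 0.0])                   # binary response
sum_trees_output = np.array([-1.2, 0.8, 2.0, -0.5])  # latent scale, made-up values

R_j = Y - expit(sum_trees_output)  # residuals live on the (0, 1) response scale
```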
```diff
@@ -257,6 +278,24 @@ def rvs(self):
             return i
 
 
+class NormalSampler:
+    def __init__(self, sigma):
+        self.size = 5000
+        self.cache = []
+        self.sigma = sigma
+
+    def random(self):
+        if self.sigma is None:
+            return 0
+        else:
+            if not self.cache:
+                self.update()
+            return self.cache.pop()
+
+    def update(self):
+        self.cache = np.random.normal(loc=0.0, scale=self.sigma, size=self.size).tolist()
+
+
 class BART(BaseBART):
     """
     BART distribution.
```
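The new `NormalSampler` amortizes the cost of leaf-noise draws by sampling 5000 values at once and popping them off a cached list; `sigma=None` degenerates to a silent sampler that always returns 0. A usage sketch, assuming the class above is in scope (the sigma value is illustrative):

```python
import numpy as np

sampler = NormalSampler(sigma=0.1)
draws = [sampler.random() for _ in range(3)]  # first call fills the 5000-item cache

silent = NormalSampler(sigma=None)
assert silent.random() == 0                   # scale=None case: leaf noise disabled
```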
```diff
@@ -278,10 +317,23 @@ class BART(BaseBART):
         Each element of split_prior should be in the [0, 1] interval and the elements should sum
         to 1. Otherwise they will be normalized.
         Defaults to None, all variables have the same a priori probability.
+    scale : float
+        Controls the variance of the proposed leaf value. The leaf values are computed as a
+        Gaussian with mean equal to the conditional residual mean and variance proportional to
+        the variance of the response variable, and inversely proportional to the number of trees
+        and the scale parameter. Defaults to None, i.e. the variance is 0.
+    inv_link : numpy function
+        Inverse link function. Defaults to None, i.e. the identity function.
+    jitter : bool
+        Whether to jitter the X values or not. Defaults to False. When values of X are repeated,
+        jittering X has the effect of increasing the number of effective splitting variables;
+        otherwise it does not have any effect.
     """
 
-    def __init__(self, X, Y, m=200, alpha=0.25, split_prior=None):
-        super().__init__(X, Y, m, alpha, split_prior)
+    def __init__(
+        self, X, Y, m=200, alpha=0.25, split_prior=None, scale=None, inv_link=None, jitter=False
+    ):
+        super().__init__(X, Y, m, alpha, split_prior, scale, inv_link)
 
     def _str_repr(self, name=None, dist=None, formatting="plain"):
         if dist is None:
```
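For count data the same machinery works with an exponential inverse link. Another hedged sketch (synthetic data; `np.exp` and the parameter values are assumptions, not mandated by the commit):

```python
import numpy as np
import pymc3 as pm

rng = np.random.default_rng(1)
X = rng.uniform(0, 1, size=(200, 3))
Y = rng.poisson(np.exp(1 + X[:, 0]))  # synthetic counts

with pm.Model():
    # scale=0.5 adds Gaussian noise with sd Y.std() / sqrt(m) to each proposed leaf
    mu = pm.BART("mu", X, Y, m=50, scale=0.5, inv_link=np.exp)
    y_obs = pm.Poisson("y_obs", mu=mu, observed=Y)
```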

pymc3/distributions/tree.py

Lines changed: 6 additions & 4 deletions
```diff
@@ -45,12 +45,13 @@ class Tree:
     tree_id : int, optional
     """
 
-    def __init__(self, tree_id=0):
+    def __init__(self, tree_id=0, num_observations=0):
         self.tree_structure = {}
         self.num_nodes = 0
         self.idx_leaf_nodes = []
         self.idx_prunable_split_nodes = []
         self.tree_id = tree_id
+        self.num_observations = num_observations
 
     def __getitem__(self, index):
         return self.get_node(index)
```
```diff
@@ -77,11 +78,12 @@ def delete_node(self, index):
         del self.tree_structure[index]
         self.num_nodes -= 1
 
-    def predict_output(self, num_observations):
-        output = np.zeros(num_observations)
+    def predict_output(self):
+        output = np.zeros(self.num_observations)
         for node_index in self.idx_leaf_nodes:
             current_node = self.get_node(node_index)
             output[current_node.idx_data_points] = current_node.value
+
         return output
 
     def predict_out_of_sample(self, x):
```
```diff
@@ -163,7 +165,7 @@ def init_tree(tree_id, leaf_node_value, idx_data_points):
         -------
 
         """
-        new_tree = Tree(tree_id)
+        new_tree = Tree(tree_id, len(idx_data_points))
         new_tree[0] = LeafNode(index=0, value=leaf_node_value, idx_data_points=idx_data_points)
         return new_tree
 
```
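Since a `Tree` now remembers its own number of observations, `predict_output()` takes no argument. A sketch of the revised call pattern, assuming the module path shown above:

```python
import numpy as np
from pymc3.distributions.tree import Tree

idx = np.arange(10, dtype="int32")
tree = Tree.init_tree(tree_id=0, leaf_node_value=0.5, idx_data_points=idx)
print(tree.predict_output().shape)  # (10,): the stored size, no argument needed
```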

pymc3/step_methods/pgbart.py

Lines changed: 17 additions & 21 deletions
```diff
@@ -86,12 +86,13 @@ def __init__(self, vars=None, num_particles=10, max_stages=5000, chunk="auto", m
 
     def astep(self, _):
         bart = self.bart
+        inv_link = bart.inv_link
         num_observations = bart.num_observations
         variable_inclusion = np.zeros(bart.num_variates, dtype="int")
 
         # For the tuning phase we restrict max_stages to a low number, otherwise it is almost sure
         # we will reach max_stages given that our first set of m trees is not good at all.
-        # Can set max_stages as a function of the number of variables/dimensions?
+        # Can set max_stages as a function of the number of variables/dimensions? XXX
         if self.tune:
             max_stages = 5
         else:
```
```diff
@@ -105,10 +106,11 @@ def astep(self, _):
                 break
             self.idx += 1
             tree = bart.trees[idx]
-            R_j = bart.get_residuals_loo(tree)
+            old_prediction = tree.predict_output()
+            bart.sum_trees_output -= old_prediction
             # Generate an initial set of SMC particles
             # at the end of the algorithm we return one of these particles as the new tree
-            particles = self.init_particles(tree.tree_id, R_j, num_observations)
+            particles = self.init_particles(tree.tree_id, num_observations, inv_link)
 
             for t in range(1, max_stages):
                 # Get old particle at stage t
```
```diff
@@ -119,13 +121,12 @@ def astep(self, _):
                 # Update weights. Since the prior is used as the proposal, the weights
                 # are updated additively as the ratio of the new and old log_likelihoods
                 for p_idx, p in enumerate(particles):
-                    new_likelihood = self.likelihood_logp(p.tree.predict_output(num_observations))
+                    new_likelihood = self.likelihood_logp(inv_link(p.tree.predict_output()))
                     p.log_weight += new_likelihood - p.old_likelihood_logp
                     p.old_likelihood_logp = new_likelihood
 
                 # Normalize weights
                 W, normalized_weights = self.normalize(particles)
-
                 # Resample all but first particle
                 re_n_w = normalized_weights[1:] / normalized_weights[1:].sum()
                 new_indices = np.random.choice(self.indices, size=len(self.indices), p=re_n_w)
```
```diff
@@ -148,8 +149,8 @@ def astep(self, _):
             new_tree = np.random.choice(particles, p=normalized_weights)
             self.old_trees_particles_list[tree.tree_id] = new_tree
             bart.trees[idx] = new_tree.tree
-            new_prediction = new_tree.tree.predict_output(num_observations)
-            bart.sum_trees_output = bart.Y - R_j + new_prediction
+            new_prediction = new_tree.tree.predict_output()
+            bart.sum_trees_output += new_prediction
 
             if not self.tune:
                 self.iter += 1
```
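The step method now maintains `sum_trees_output` incrementally: the prediction of the tree being replaced is subtracted before the SMC pass, and the winner's prediction is added back afterwards, replacing the old `bart.Y - R_j + new_prediction` bookkeeping. Schematically (the arrays are stand-ins for real tree predictions):

```python
import numpy as np

sum_trees_output = np.array([1.0, 2.0, 3.0])
old_prediction = np.array([0.5, 0.5, 0.5])  # tree.predict_output() of the old tree
new_prediction = np.array([0.4, 0.6, 0.7])  # prediction of the resampled tree

sum_trees_output -= old_prediction  # remove the tree being replaced
# ... SMC stages select the replacement tree here ...
sum_trees_output += new_prediction  # fold the winner back into the running sum
```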
```diff
@@ -161,8 +162,7 @@ def astep(self, _):
                 variable_inclusion[index] += 1
 
         stats = {"variable_inclusion": variable_inclusion}
-
-        return bart.sum_trees_output, [stats]
+        return inv_link(bart.sum_trees_output), [stats]
 
     @staticmethod
     def competence(var, has_grad):
```
```diff
@@ -194,31 +194,26 @@ def get_old_tree_particle(self, tree_id, t):
         old_tree_particle.set_particle_to_step(t)
         return old_tree_particle
 
-    def init_particles(self, tree_id, R_j, num_observations):
+    def init_particles(self, tree_id, num_observations, inv_link):
         """
         Initialize particles
         """
         # The first particle is from the tree we are trying to replace
         prev_tree = self.get_old_tree_particle(tree_id, 0)
-        likelihood = self.likelihood_logp(prev_tree.tree.predict_output(num_observations))
+        likelihood = self.likelihood_logp(inv_link(prev_tree.tree.predict_output()))
         prev_tree.old_likelihood_logp = likelihood
         prev_tree.log_weight = likelihood - self.log_num_particles
         particles = [prev_tree]
 
         # The rest of the particles are identically initialized
-        initial_value_leaf_nodes = R_j.mean()
         initial_idx_data_points_leaf_nodes = np.arange(num_observations, dtype="int32")
         new_tree = Tree.init_tree(
             tree_id=tree_id,
-            leaf_node_value=initial_value_leaf_nodes,
+            leaf_node_value=0,
             idx_data_points=initial_idx_data_points_leaf_nodes,
         )
-        likelihood_logp = self.likelihood_logp(new_tree.predict_output(num_observations))
-        log_weight = likelihood_logp - self.log_num_particles
         for i in range(1, self.num_particles):
-            particles.append(
-                ParticleTree(new_tree, self.bart.prior_prob_leaf_node, log_weight, likelihood_logp)
-            )
+            particles.append(ParticleTree(new_tree, self.bart.prior_prob_leaf_node, 0, 0))
 
         return np.array(particles)
 
```
```diff
@@ -237,10 +232,10 @@ class ParticleTree:
 
     def __init__(self, tree, prior_prob_leaf_node, log_weight=0, likelihood=0):
         self.tree = tree.copy()  # keeps the tree we care about at the moment
-        self.expansion_nodes = tree.idx_leaf_nodes.copy()  # This should be the array [0]
+        self.expansion_nodes = [0]
         self.tree_history = [self.tree]
         self.expansion_nodes_history = [self.expansion_nodes]
-        self.log_weight = 0
+        self.log_weight = log_weight
         self.prior_prob_leaf_node = prior_prob_leaf_node
         self.old_likelihood_logp = likelihood
         self.used_variates = []
```
```diff
@@ -253,7 +248,8 @@ def sample_tree_sequential(self, bart):
 
         if prob_leaf < np.random.random():
             grow_successful, index_selected_predictor = bart.grow_tree(
-                self.tree, index_leaf_node
+                self.tree,
+                index_leaf_node,
             )
             if grow_successful:
                 # Add new leaf nodes indexes
```
