Update intercept docstring

jessegrabowski · jessegrabowski · commit 7b3cdcf32825 · 2024-12-10T06:34:03.000-06:00
diff --git a/pymc_experimental/model/modular/components.py b/pymc_experimental/model/modular/components.py
@@ -6,7 +6,7 @@
 import pymc as pm
 import pytensor.tensor as pt
 
-from model.modular.utilities import ColumnType, hierarchical_prior_to_requested_depth
+from model.modular.utilities import ColumnType, get_X_data, hierarchical_prior_to_requested_depth
 from patsy import dmatrix
 
 POOLING_TYPES = Literal["none", "complete", "partial"]
@@ -105,7 +105,6 @@ def __init__(
         prior_params: dict | None = None,
     ):
         """
-        TODO: Update signature docs
         Class to represent an intercept term in a GLM model.
 
         By intercept, it is meant any constant term in the model that is not a function of any input data. This can be
@@ -116,21 +115,15 @@ def __init__(
         ----------
         name: str, optional
             Name of the intercept term. If None, a default name is generated based on the index_data.
-        index_data: Series or DataFrame, optional
-            Index data used to build hierarchical priors. If there are multiple columns, the columns are treated as
-            levels of a "telescoping" hierarchy, with the leftmost column representing the top level of the hierarchy,
-            and depth increasing to the right.
-
-            The index of the index_data must match the index of the observed data.
-        prior: str, optional
-            Name of the PyMC distribution to use for the intercept term. Default is "Normal".
+        pooling_cols: str or list of str, optional
+            Columns of the independent data to use as labels for pooling. These columns will be treated as categorical.
+            If None, no pooling is applied. If a list is provided, a "telescoping" hierarchy is constructed from left
+            to right, with the mean of each subsequent level centered on the mean of the previous level.
         pooling: str, one of ["none", "complete", "partial"], default "complete"
             Type of pooling to use for the intercept term. If "none", no pooling is applied, and each group in the
             index_data is treated as independent. If "complete", complete pooling is applied, and all data are treated
             as coming from the same group. If "partial", a hierarchical prior is constructed that shares information
             across groups in the index_data.
-        prior_params: dict, optional
-            Additional keyword arguments to pass to the PyMC distribution specified by the prior argument.
         hierarchical_params: dict, optional
             Additional keyword arguments to configure priors in the hierarchical_prior_to_requested_depth function.
             Options include:
@@ -141,6 +134,11 @@ def __init__(
                     Default is {"alpha": 2, "beta": 1}
                 offset_dist: str, one of ["zerosum", "normal", "laplace"]
                     Name of the distribution to use for the offset distribution. Default is "zerosum"
+        prior: str, optional
+            Name of the PyMC distribution to use for the intercept term. Default is "Normal".
+        prior_params: dict, optional
+            Additional keyword arguments to pass to the PyMC distribution specified by the prior argument.
+
         """
         _validate_pooling_params(pooling_cols, pooling)
 
@@ -158,25 +156,25 @@ def __init__(
 
         data_name = ", ".join(pooling_cols)
         self.name = name or f"Constant(pooling_cols={data_name})"
+
         super().__init__()
 
-    def build(self, model=None):
+    def build(self, model: pm.Model | None = None):
         model = pm.modelcontext(model)
         with model:
             if self.pooling == "complete":
                 intercept = getattr(pm, self.prior)(f"{self.name}", **self.prior_params)
                 return intercept
 
-            [i for i, col in enumerate(model.coords["feature"]) if col in self.pooling_cols]
-
             intercept = hierarchical_prior_to_requested_depth(
                 self.name,
-                model.X_df[self.pooling_cols],  # TODO: Reconsider this
+                df=get_X_data(model)[self.pooling_cols],
                 model=model,
                 dims=None,
                 no_pooling=self.pooling == "none",
                 **self.hierarchical_params,
             )
+
         return intercept
 
 
diff --git a/pymc_experimental/model/modular/likelihood.py b/pymc_experimental/model/modular/likelihood.py
@@ -43,18 +43,18 @@ def __init__(self, target_col: ColumnType, data: pd.DataFrame):
 
         # TODO: Reconsider this (two sources of nearly the same info not good)
         X_df = data.drop(columns=[target_col])
-        X_data = X_df.copy()
+
         self.column_labels = {}
-        for col, dtype in X_data.dtypes.to_dict().items():
+        for col, dtype in X_df.dtypes.to_dict().items():
             if dtype.name.startswith("float"):
                 pass
             elif dtype.name == "object":
                 # TODO: We definitely need to save these if we want to factorize predict data
-                col_array, labels = pd.factorize(X_data[col], sort=True)
-                X_data[col] = col_array.astype("float64")
+                col_array, labels = pd.factorize(X_df[col], sort=True)
+                X_df[col] = col_array.astype("float64")
                 self.column_labels[col] = {label: i for i, label in enumerate(labels.values)}
             elif dtype.name.startswith("int"):
-                X_data[col] = X_data[col].astype("float64")
+                X_df[col] = X_df[col].astype("float64")
             else:
                 raise NotImplementedError(
                     f"Haven't decided how to handle the following type: {dtype.name}"
@@ -63,14 +63,13 @@ def __init__(self, target_col: ColumnType, data: pd.DataFrame):
         self.obs_dim = data.index.name
         coords = {
             self.obs_dim: data.index.values,
-            "feature": list(X_data.columns),
+            "feature": list(X_df.columns),
         }
         with self._get_model_class(coords) as self.model:
-            self.model.X_df = X_df  # FIXME: Definitely not a solution
             pm.Data(f"{target_col}_observed", data[target_col], dims=self.obs_dim)
             pm.Data(
                 "X_data",
-                X_data,
+                X_df,
                 dims=(self.obs_dim, "feature"),
                 shape=(None, len(coords["feature"])),
             )
diff --git a/pymc_experimental/model/modular/utilities.py b/pymc_experimental/model/modular/utilities.py
@@ -37,6 +37,10 @@ def _get_x_cols(
     return model["X_data"][:, cols_idx]
 
 
+def get_X_data(model, data_name="X_data"):
+    return model[data_name]
+
+
 def make_level_maps(df: pd.DataFrame, ordered_levels: list[str]):
     """
     For each row of data, create a mapping between levels of a arbitrary set of levels defined by `ordered_levels`.