pymc-devs
diff --git a/pymc_experimental/model/modular/components.py b/pymc_experimental/model/modular/components.py
Lines changed: 24 additions & 49 deletions
diff --git a/pymc_experimental/model/modular/likelihood.py b/pymc_experimental/model/modular/likelihood.py
Lines changed: 2 additions & 22 deletions
@@ -1,44 +1,18 @@
 from abc import ABC, abstractmethod
-from typing import Literal, get_args
 
 import pandas as pd
 import pymc as pm
 
 from model.modular.utilities import (
+    PRIOR_DEFAULT_KWARGS,
     ColumnType,
+    PoolingType,
     get_X_data,
-    hierarchical_prior_to_requested_depth,
+    make_hierarchical_prior,
     select_data_columns,
 )
 from patsy import dmatrix
 
-PoolingType = Literal["none", "complete", "partial", None]
-valid_pooling = get_args(PoolingType)
-
-
-def _validate_pooling_params(pooling_columns: ColumnType, pooling: PoolingType):
-    """
-    Helper function to validate inputs to a GLM component.
-
-    Parameters
-    ----------
-    pooling_columns: str or list of str
-        Data columns used to construct a hierarchical prior
-    pooling: str
-        Type of pooling to use in the component
-
-    Returns
-    -------
-    None
-    """
-    if pooling_columns is not None and pooling == "complete":
-        raise ValueError("Index data provided but complete pooling was requested")
-    if pooling_columns is None and pooling != "complete":
-        raise ValueError(
-            "Index data must be provided for partial pooling (pooling = 'partial') or no pooling "
-            "(pooling = 'none')"
-        )
-
 
 class GLMModel(ABC):
     """Base class for GLM components. Subclasses should implement the build method to construct the component."""
@@ -91,7 +65,7 @@ def __init__(
         self,
         name: str | None = None,
         *,
-        pooling_cols: ColumnType = None,
+        pooling_columns: ColumnType = None,
         pooling: PoolingType = "complete",
         hierarchical_params: dict | None = None,
         prior: str = "Normal",
@@ -108,7 +82,7 @@ def __init__(
         ----------
         name: str, optional
             Name of the intercept term. If None, a default name is generated based on the index_data.
-        pooling_cols: str or list of str, optional
+        pooling_columns: str or list of str, optional
             Columns of the independent data to use as labels for pooling. These columns will be treated as categorical.
             If None, no pooling is applied. If a list is provided, a "telescoping" hierarchy is constructed from left
             to right, with the mean of each subsequent level centered on the mean of the previous level.
@@ -133,37 +107,41 @@ def __init__(
             Additional keyword arguments to pass to the PyMC distribution specified by the prior argument.
 
         """
-        _validate_pooling_params(pooling_cols, pooling)
-
-        self.pooling_cols = pooling_cols
         self.hierarchical_params = hierarchical_params if hierarchical_params is not None else {}
-        self.pooling = pooling if pooling_cols is not None else "complete"
+        self.pooling = pooling
 
         self.prior = prior
         self.prior_params = prior_params if prior_params is not None else {}
 
-        if pooling_cols is None:
-            pooling_cols = []
-        elif isinstance(pooling_cols, str):
-            pooling_cols = [pooling_cols]
+        if pooling_columns is None:
+            pooling_columns = []
+        elif isinstance(pooling_columns, str):
+            pooling_columns = [pooling_columns]
 
-        name = name or f"Intercept(pooling_cols={pooling_cols})"
+        self.pooling_columns = pooling_columns
+        name = name or f"Intercept(pooling_cols={pooling_columns})"
 
         super().__init__(name=name)
 
     def build(self, model: pm.Model | None = None):
         model = pm.modelcontext(model)
         with model:
             if self.pooling == "complete":
-                intercept = getattr(pm, self.prior.title())(f"{self.name}", **self.prior_params)
+                prior_params = PRIOR_DEFAULT_KWARGS[self.prior].copy()
+                prior_params.update(self.prior_params)
+
+                intercept = getattr(pm, self.prior)(f"{self.name}", **prior_params)
                 return intercept
 
-            intercept = hierarchical_prior_to_requested_depth(
+            intercept = make_hierarchical_prior(
                 self.name,
-                df=get_X_data(model)[self.pooling_cols],
+                X=get_X_data(model),
                 model=model,
+                pooling_columns=self.pooling_columns,
                 dims=None,
-                no_pooling=self.pooling == "none",
+                pooling=self.pooling,
+                prior=self.prior,
+                prior_kwargs=self.prior_params,
                 **self.hierarchical_params,
             )
 
@@ -219,8 +197,6 @@ def __init__(
         prior_params:
             Additional keyword arguments to pass to the PyMC distribution specified by the prior argument.
         """
-        _validate_pooling_params(pooling_columns, pooling)
-
         self.feature_columns = feature_columns
         self.pooling = pooling
         self.pooling_columns = pooling_columns
@@ -248,7 +224,7 @@ def build(self, model=None):
                 )
                 return X @ beta
 
-            beta = hierarchical_prior_to_requested_depth(
+            beta = make_hierarchical_prior(
                 self.name,
                 self.index_data,
                 model=model,
@@ -318,7 +294,6 @@ def __init__(
                 offset_dist: str, one of ["zerosum", "normal", "laplace"]
                     Name of the distribution to use for the offset distribution. Default is "zerosum"
         """
-        _validate_pooling_params(index_data, pooling)
         self.name = name if name else f"Spline({feature_column})"
         self.feature_column = feature_column
         self.n_knots = n_knots
@@ -352,7 +327,7 @@ def build(self, model: pm.Model | None = None):
 
             elif self.pooling_columns is not None:
                 X = select_data_columns(self.pooling_columns, model)
-                beta = hierarchical_prior_to_requested_depth(
+                beta = make_hierarchical_prior(
                     name=self.name,
                     X=X,
                     model=model,
 
@@ -3,7 +3,6 @@
 from typing import Literal, get_args
 
 import arviz as az
-import numpy as np
 import pandas as pd
 import pymc as pm
 import pytensor.tensor as pt
@@ -14,7 +13,7 @@
 from pytensor.tensor.random.type import RandomType
 
 from pymc_experimental.model.marginal.marginal_model import MarginalModel
-from pymc_experimental.model.modular.utilities import ColumnType
+from pymc_experimental.model.modular.utilities import ColumnType, encode_categoricals
 
 LIKELIHOOD_TYPES = Literal["lognormal", "logt", "mixture", "unmarginalized-mixture"]
 valid_likelihoods = get_args(LIKELIHOOD_TYPES)
@@ -42,33 +41,14 @@ def __init__(self, target_col: ColumnType, data: pd.DataFrame):
             [target_col] = target_col
         self.target_col = target_col
 
-        # TODO: Reconsider this (two sources of nearly the same info not good)
         X_df = data.drop(columns=[target_col])
 
         self.obs_dim = data.index.name
         self.coords = {
             self.obs_dim: data.index.values,
         }
 
-        for col, dtype in X_df.dtypes.to_dict().items():
-            if dtype.name.startswith("float"):
-                pass
-            elif dtype.name == "object":
-                # TODO: We definitely need to save these if we want to factorize predict data
-                col_array, labels = pd.factorize(X_df[col], sort=True)
-                X_df[col] = col_array.astype("float64")
-                self.coords[col] = labels
-            elif dtype.name.startswith("int"):
-                _data = X_df[col].copy()
-                X_df[col] = X_df[col].astype("float64")
-                assert np.all(
-                    _data == X_df[col].astype("int")
-                ), "Information was lost in conversion to float"
-
-            else:
-                raise NotImplementedError(
-                    f"Haven't decided how to handle the following type: {dtype.name}"
-                )
+        X_df, self.coords = encode_categoricals(X_df, self.coords)
 
         numeric_cols = [
             col for col, dtype in X_df.dtypes.to_dict().items() if dtype.name.startswith("float")