pymc-devs
diff --git a/‎conda-envs/environment-test.yml‎
Lines changed: 2 additions & 0 deletions b/‎conda-envs/environment-test.yml‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎conda-envs/windows-environment-test.yml‎
Lines changed: 1 addition & 0 deletions b/‎conda-envs/windows-environment-test.yml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎pymc_experimental/linearmodel.py‎
Lines changed: 18 additions & 12 deletions b/‎pymc_experimental/linearmodel.py‎
Lines changed: 18 additions & 12 deletions
diff --git a/‎pymc_experimental/model_builder.py‎
Lines changed: 66 additions & 46 deletions b/‎pymc_experimental/model_builder.py‎
Lines changed: 66 additions & 46 deletions
diff --git a/‎pymc_experimental/preprocessing/__init__.py‎ b/‎pymc_experimental/preprocessing/__init__.py‎
diff --git a/‎pymc_experimental/preprocessing/standard_scaler.py‎
Lines changed: 16 additions & 0 deletions b/‎pymc_experimental/preprocessing/standard_scaler.py‎
Lines changed: 16 additions & 0 deletions
@@ -4,10 +4,12 @@ channels:
 - defaults
 dependencies:
 - pip
+
 - pytest-cov>=2.5
 - pytest>=3.0
 - dask
 - xhistogram
 - pip:
   - pymc>=5.4.1  # CI was failing to resolve
   - blackjax
+  - scikit-learn
@@ -10,3 +10,4 @@ dependencies:
 - xhistogram
 - pip:
   - pymc>=5.4.1  # CI was failing to resolve
+  - scikit-learn
@@ -1,10 +1,17 @@
+from typing import Dict, Optional, Union
+
 import numpy as np
+import pandas as pd
 import pymc as pm
 
 from pymc_experimental.model_builder import ModelBuilder
 
 
 class LinearModel(ModelBuilder):
+    def __init__(self, model_config: Dict = None, sampler_config: Dict = None, nsamples=100):
+        # NOTE(review): this method is defined above the class's triple-quoted
+        # description, so that string is no longer the first statement of the
+        # class body and will not be exposed as ``LinearModel.__doc__``.
+        # Consider moving ``__init__`` below the description.
+        #
+        # ``nsamples`` is stored on the instance before the two config dicts are
+        # handed to ``ModelBuilder.__init__``.
+        # NOTE(review): presumably the number of synthetic samples, as in the
+        # removed ``generate_model_data`` -- confirm it is still used anywhere.
+        self.nsamples = nsamples
+        super().__init__(model_config, sampler_config)
+
     """
     This class is an implementation of a single-input linear regression model in PYMC using the
     BayesianEstimator base class for interoperability with scikit-learn.
@@ -30,7 +37,11 @@ def default_sampler_config(self):
             "target_accept": 0.95,
         }
 
-    def build_model(self, data=None):
+    @property
+    def output_var(self):
+        """Name of this model's output variable (``"y_hat"``), as required by ``ModelBuilder``."""
+        return "y_hat"
+
+    def build_model(self, X: pd.DataFrame, y: pd.Series):
         """
         Build the PyMC model.
 
@@ -78,16 +89,16 @@ def build_model(self, data=None):
                 observed=y_data,
                 dims="observation",
             )
-            self.output_var = "y_hat"
 
-    def _data_setter(self, X, y=None):
+    def _data_setter(self, X: pd.DataFrame, y: Optional[Union[pd.DataFrame, pd.Series]] = None):
+        # Replace the model's "x" data (and "y_data" when ``y`` is given) inside
+        # the model context. ``squeeze()`` drops length-1 axes, so a
+        # single-column ``X`` is passed on as 1-D data.
+        # NOTE(review): a multi-column ``X`` stays 2-D and a single-row input
+        # collapses to a scalar under ``squeeze()`` -- confirm callers never
+        # pass either.
         with self.model:
-            pm.set_data({"x": X[:, 0]})
+            pm.set_data({"x": X.squeeze()})
             if y is not None:
                 pm.set_data({"y_data": y.squeeze()})
 
-    @classmethod
-    def generate_model_data(cls, nsamples=100, data=None):
+    def generate_and_preprocess_model_data(
+        self, X: Union[pd.DataFrame, pd.Series], y: pd.Series
+    ) -> None:
         """
         Generate model data for linear regression.
 
@@ -112,9 +123,4 @@ def generate_model_data(cls, nsamples=100, data=None):
         >>> assert x.shape == (100, 1)
         >>> assert y.shape == (100,)
         """
-        x = np.linspace(start=0, stop=1, num=nsamples)
-        y = 5 * x + 3
-        y = y + np.random.normal(0, 1, len(x))
-
-        x = np.expand_dims(x, -1)  # scikit assumes a dimension for features.
-        return x, y
+        self.X, self.y = X, y
@@ -50,7 +50,6 @@ class ModelBuilder:
 
     def __init__(
         self,
-        data: Union[np.ndarray, pd.DataFrame, pd.Series] = None,
         model_config: Dict = None,
         sampler_config: Dict = None,
     ):
@@ -77,10 +76,8 @@ def __init__(
 
         self.model_config = model_config  # parameters for priors etc.
         self.model = None  # Set by build_model
-        self.output_var = ""  # Set by build_model
         self.idata: Optional[az.InferenceData] = None  # idata is generated during fitting
         self.is_fitted_ = False
-        self.data = data
 
     def _validate_data(self, X, y=None):
         if y is not None:
@@ -122,6 +119,19 @@ def _data_setter(
 
         raise NotImplementedError
 
+    @property
+    @abstractmethod
+    def output_var(self):
+        """
+        Returns the name of the output variable of the model.
+
+        Subclasses must override this property. ``fit`` uses the name as the
+        column label of the target when it stores ``fit_data``, and ``load``
+        uses it to split the stored ``fit_data`` back into ``X`` and ``y``.
+
+        Returns
+        -------
+        output_var : str
+            Name of the output variable of the model.
+        """
+        raise NotImplementedError
+
     @property
     @abstractmethod
     def default_model_config(self) -> Dict:
@@ -176,39 +186,41 @@ def default_sampler_config(self) -> Dict:
         raise NotImplementedError
 
     @abstractmethod
-    def generate_model_data(
-        self, data: Union[np.ndarray, pd.DataFrame, pd.Series] = None
-    ) -> pd.DataFrame:
+    def generate_and_preprocess_model_data(
+        self, X: Union[pd.DataFrame, pd.Series], y: pd.Series
+    ) -> None:
         """
-        Returns a default dataset for a class, can be used as a hint to data formatting required for the class
-        If data is not None, dataset will be created from it's content.
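+        Applies preprocessing to the data before fitting the model.
+        Implementations must store the processed data in ``self.X`` and ``self.y``,
+        which ``fit`` then passes to ``build_model``.
+        Implementations may also validate the data here and set ``self.model_coords``
+        based on the provided dataset.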
 
         Parameters:
-        data : Union[np.ndarray, pd.DataFrame, pd.Series], optional
-            dataset that will replace the default sample data
-
+        X : array, shape (n_obs, n_features)
+        y : array, shape (n_obs,)
 
         Examples
         --------
         >>>     @classmethod
-        >>>     def generate_model_data(self):
+        >>>     def generate_and_preprocess_model_data(self, X, y):
         >>>         x = np.linspace(start=1, stop=50, num=100)
         >>>         y = 5 * x + 3 + np.random.normal(0, 1, len(x)) * np.random.rand(100)*10 +  np.random.rand(100)*6.4
-        >>>         data = pd.DataFrame({'input': x, 'output': y})
+        >>>         X = pd.DataFrame(x, columns=['x'])
+        >>>         y = pd.Series(y, name='y')
+        >>>         self.X = X
+        >>>         self.y = y
 
         Returns
         -------
-        data : pd.DataFrame
-            The data we want to train the model on.
+        None
 
         """
         raise NotImplementedError
 
     @abstractmethod
     def build_model(
         self,
-        data: Union[np.ndarray, pd.DataFrame, pd.Series] = None,
-        model_config: Dict = None,
+        X: pd.DataFrame,
+        y: pd.Series,
         **kwargs,
     ) -> None:
         """
@@ -217,22 +229,31 @@ def build_model(
 
         Parameters
         ----------
-        data : dict
-            Preformated data that is going to be used in the model. For efficiency reasons it should contain only the necesary data columns,
-            not entire available dataset since it's going to be encoded into data used to recreate the model.
-            If not provided uses data from self.data
-        model_config : dict
-            Dictionary where keys are strings representing names of parameters of the model, values are dictionaries of parameters
-            needed for creating model parameters. If not provided uses data from self.model_config
+        X : pd.DataFrame
+            The input data that is going to be used in the model. This should be a DataFrame
+            containing the features (predictors) for the model. For efficiency reasons, it should
+            only contain the necessary data columns, not the entire available dataset, as this
+            will be encoded into the data used to recreate the model.
+
+        y : pd.Series
+            The target data for the model. This should be a Series representing the output
+            or dependent variable for the model.
+
+        kwargs : dict
+            Additional keyword arguments that may be used for model configuration.
 
         See Also
         --------
         default_model_config : returns default model config
 
-        Returns:
-        ----------
+        Returns
+        -------
         None
 
+        Raises
+        ------
+        NotImplementedError
+            This is an abstract method and must be implemented in a subclass.
         """
         raise NotImplementedError
 
@@ -248,7 +269,7 @@ def sample_model(self, **kwargs):
         Returns
         -------
         xarray.Dataset
-            The PyMC3 samples dataset.
+            The PyMC samples dataset.
 
         Raises
         ------
@@ -383,12 +404,14 @@ def load(cls, fname: str):
         filepath = Path(str(fname))
         idata = az.from_netcdf(filepath)
         model = cls(
-            data=idata.fit_data.to_dataframe(),
             model_config=json.loads(idata.attrs["model_config"]),
             sampler_config=json.loads(idata.attrs["sampler_config"]),
         )
         model.idata = idata
-        model.build_model()
+        dataset = idata.fit_data.to_dataframe()
+        X = dataset.drop(columns=[model.output_var])
+        y = dataset[model.output_var]
+        model.build_model(X, y)
         # All previously used data is in idata.
 
         if model.id != idata.attrs["id"]:
@@ -400,8 +423,8 @@ def load(cls, fname: str):
 
     def fit(
         self,
-        X: Union[np.ndarray, pd.DataFrame, pd.Series],
-        y: Union[np.ndarray, pd.Series],
+        X: pd.DataFrame,
+        y: pd.Series,
         progressbar: bool = True,
         predictor_names: List[str] = None,
         random_seed: RandomState = None,
@@ -442,25 +465,19 @@ def fit(
         if predictor_names is None:
             predictor_names = []
 
-        X, y = X, y
-
-        self.build_model(data=self.data)
-        self._data_setter(X, y)
+        y = pd.DataFrame({self.output_var: y})
+        self.generate_and_preprocess_model_data(X, y.values.flatten())
+        self.build_model(self.X, self.y)
 
         sampler_config = self.sampler_config.copy()
         sampler_config["progressbar"] = progressbar
         sampler_config["random_seed"] = random_seed
         sampler_config.update(**kwargs)
-
         self.idata = self.sample_model(**sampler_config)
-        if type(X) is np.ndarray:
-            if len(predictor_names) > 0:
-                X = pd.DataFrame(X, columns=predictor_names)
-            else:
-                X = pd.DataFrame(X, columns=[f"predictor{x}" for x in range(1, X.shape[1] + 1)])
-        if type(y) is np.ndarray:
-            y = pd.Series(y, name="target")
-        combined_data = pd.concat([X, y], axis=1)
+
+        X_df = pd.DataFrame(X, columns=X.columns)
+        combined_data = pd.concat([X_df, y], axis=1)
+        assert all(combined_data.columns), "All columns must have non-empty names"
         self.idata.add_groups(fit_data=combined_data.to_xarray())  # type: ignore
         return self.idata  # type: ignore
 
@@ -513,6 +530,7 @@ def predict(
     def sample_prior_predictive(
         self,
         X_pred,
+        y_pred=None,
         samples: Optional[int] = None,
         extend_idata: bool = False,
         combined: bool = True,
@@ -539,13 +557,15 @@ def sample_prior_predictive(
         prior_predictive_samples : DataArray, shape (n_pred, samples)
             Prior predictive samples for each input X_pred
         """
+        if y_pred is None:
+            y_pred = np.zeros(len(X_pred))
         if samples is None:
             samples = self.sampler_config.get("draws", 500)
 
         if self.model is None:
-            self.build_model()
+            self.build_model(X_pred, y_pred)
 
-        self._data_setter(X_pred)
+        self._data_setter(X_pred, y_pred)
         if self.model is not None:
             with self.model:  # sample with new input data
                 prior_pred: az.InferenceData = pm.sample_prior_predictive(samples, **kwargs)
 
@@ -0,0 +1,16 @@
+import pandas as pd
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.preprocessing import StandardScaler
+
+
+class StandardScalerDF(StandardScaler, TransformerMixin, BaseEstimator):
+    """
+    ``StandardScaler`` that returns its result as a ``pandas.DataFrame``.
+
+    When the input is a DataFrame, the scaled values keep its index and
+    column labels. Any other dense array-like input (e.g. a NumPy array) is
+    returned as a DataFrame with pandas' default integer labels instead of
+    raising ``AttributeError``.
+
+    Parameters
+    ----------
+    with_mean : bool, default True
+        Passed through to ``StandardScaler``.
+    with_std : bool, default True
+        Passed through to ``StandardScaler``.
+    """
+
+    def __init__(self, with_mean=True, with_std=True):
+        super().__init__(with_mean=with_mean, with_std=with_std)
+
+    @staticmethod
+    def _to_frame(values, X):
+        """Wrap ``values`` in a DataFrame labelled like ``X`` when ``X`` is a DataFrame."""
+        # An explicit isinstance check is used rather than probing for an
+        # ``index`` attribute: plain lists and tuples expose an unrelated
+        # ``index`` method.
+        if isinstance(X, pd.DataFrame):
+            return pd.DataFrame(values, index=X.index, columns=X.columns)
+        return pd.DataFrame(values)
+
+    def transform(self, X, y=None):
+        """
+        Scale ``X`` with the fitted statistics.
+
+        ``y`` is accepted for pipeline compatibility and ignored.
+
+        Returns
+        -------
+        pd.DataFrame
+            Scaled data, labelled like ``X`` when ``X`` is a DataFrame.
+        """
+        return self._to_frame(super().transform(X), X)
+
+    def fit_transform(self, X, y=None):
+        """
+        Fit the scaler on ``X`` and return the scaled data as a DataFrame.
+
+        ``y`` is forwarded to ``fit``.
+        """
+        # ``transform`` already builds the DataFrame, so fit and delegate
+        # instead of wrapping the result a second time.
+        return self.fit(X, y).transform(X)