Skip to content

Commit bb68399

Browse files
model_builder adaptations for full sklearn compatibility
adapting linearmodel and its tests to use sklearn-only approach
1 parent c3fd654 commit bb68399

File tree

10 files changed

+204
-135
lines changed

10 files changed

+204
-135
lines changed

conda-envs/environment-test.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,12 @@ channels:
44
- defaults
55
dependencies:
66
- pip
7+
78
- pytest-cov>=2.5
89
- pytest>=3.0
910
- dask
1011
- xhistogram
1112
- pip:
1213
- pymc>=5.4.1 # CI was failing to resolve
1314
- blackjax
15+
- scikit-learn

conda-envs/windows-environment-test.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,3 +10,4 @@ dependencies:
1010
- xhistogram
1111
- pip:
1212
- pymc>=5.4.1 # CI was failing to resolve
13+
- scikit-learn

pymc_experimental/linearmodel.py

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,17 @@
1+
from typing import Dict, Optional, Union
2+
13
import numpy as np
4+
import pandas as pd
25
import pymc as pm
36

47
from pymc_experimental.model_builder import ModelBuilder
58

69

710
class LinearModel(ModelBuilder):
11+
def __init__(self, model_config: Dict = None, sampler_config: Dict = None, nsamples=100):
12+
self.nsamples = nsamples
13+
super().__init__(model_config, sampler_config)
14+
815
"""
916
This class is an implementation of a single-input linear regression model in PYMC using the
1017
BayesianEstimator base class for interoperability with scikit-learn.
@@ -30,7 +37,11 @@ def default_sampler_config(self):
3037
"target_accept": 0.95,
3138
}
3239

33-
def build_model(self, data=None):
40+
@property
41+
def output_var(self):
42+
return "y_hat"
43+
44+
def build_model(self, X: pd.DataFrame, y: pd.Series):
3445
"""
3546
Build the PyMC model.
3647
@@ -78,16 +89,16 @@ def build_model(self, data=None):
7889
observed=y_data,
7990
dims="observation",
8091
)
81-
self.output_var = "y_hat"
8292

83-
def _data_setter(self, X, y=None):
93+
def _data_setter(self, X: pd.DataFrame, y: Optional[Union[pd.DataFrame, pd.Series]] = None):
8494
with self.model:
85-
pm.set_data({"x": X[:, 0]})
95+
pm.set_data({"x": X.squeeze()})
8696
if y is not None:
8797
pm.set_data({"y_data": y.squeeze()})
8898

89-
@classmethod
90-
def generate_model_data(cls, nsamples=100, data=None):
99+
def generate_and_preprocess_model_data(
100+
self, X: Union[pd.DataFrame, pd.Series], y: pd.Series
101+
) -> None:
91102
"""
92103
Generate model data for linear regression.
93104
@@ -112,9 +123,4 @@ def generate_model_data(cls, nsamples=100, data=None):
112123
>>> assert x.shape == (100, 1)
113124
>>> assert y.shape == (100,)
114125
"""
115-
x = np.linspace(start=0, stop=1, num=nsamples)
116-
y = 5 * x + 3
117-
y = y + np.random.normal(0, 1, len(x))
118-
119-
x = np.expand_dims(x, -1) # scikit assumes a dimension for features.
120-
return x, y
126+
self.X, self.y = X, y

pymc_experimental/model_builder.py

Lines changed: 66 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,6 @@ class ModelBuilder:
5050

5151
def __init__(
5252
self,
53-
data: Union[np.ndarray, pd.DataFrame, pd.Series] = None,
5453
model_config: Dict = None,
5554
sampler_config: Dict = None,
5655
):
@@ -77,10 +76,8 @@ def __init__(
7776

7877
self.model_config = model_config # parameters for priors etc.
7978
self.model = None # Set by build_model
80-
self.output_var = "" # Set by build_model
8179
self.idata: Optional[az.InferenceData] = None # idata is generated during fitting
8280
self.is_fitted_ = False
83-
self.data = data
8481

8582
def _validate_data(self, X, y=None):
8683
if y is not None:
@@ -122,6 +119,19 @@ def _data_setter(
122119

123120
raise NotImplementedError
124121

122+
@property
123+
@abstractmethod
124+
def output_var(self):
125+
"""
126+
Returns the name of the output variable of the model.
127+
128+
Returns
129+
-------
130+
output_var : str
131+
Name of the output variable of the model.
132+
"""
133+
raise NotImplementedError
134+
125135
@property
126136
@abstractmethod
127137
def default_model_config(self) -> Dict:
@@ -176,39 +186,41 @@ def default_sampler_config(self) -> Dict:
176186
raise NotImplementedError
177187

178188
@abstractmethod
179-
def generate_model_data(
180-
self, data: Union[np.ndarray, pd.DataFrame, pd.Series] = None
181-
) -> pd.DataFrame:
189+
def generate_and_preprocess_model_data(
190+
self, X: Union[pd.DataFrame, pd.Series], y: pd.Series
191+
) -> None:
182192
"""
183-
Returns a default dataset for a class, can be used as a hint to data formatting required for the class
184-
If data is not None, dataset will be created from it's content.
193+
Applies preprocessing to the data before fitting the model.
194+
if validate is True, it will check if the data is valid for the model.
195+
sets self.model_coords based on provided dataset
185196
186197
Parameters:
187-
data : Union[np.ndarray, pd.DataFrame, pd.Series], optional
188-
dataset that will replace the default sample data
189-
198+
X : array, shape (n_obs, n_features)
199+
y : array, shape (n_obs,)
190200
191201
Examples
192202
--------
193203
>>> @classmethod
194-
>>> def generate_model_data(self):
204+
>>> def generate_and_preprocess_model_data(self, X, y):
195205
>>> x = np.linspace(start=1, stop=50, num=100)
196206
>>> y = 5 * x + 3 + np.random.normal(0, 1, len(x)) * np.random.rand(100)*10 + np.random.rand(100)*6.4
197-
>>> data = pd.DataFrame({'input': x, 'output': y})
207+
>>> X = pd.DataFrame(x, columns=['x'])
208+
>>> y = pd.Series(y, name='y')
209+
>>> self.X = X
210+
>>> self.y = y
198211
199212
Returns
200213
-------
201-
data : pd.DataFrame
202-
The data we want to train the model on.
214+
None
203215
204216
"""
205217
raise NotImplementedError
206218

207219
@abstractmethod
208220
def build_model(
209221
self,
210-
data: Union[np.ndarray, pd.DataFrame, pd.Series] = None,
211-
model_config: Dict = None,
222+
X: pd.DataFrame,
223+
y: pd.Series,
212224
**kwargs,
213225
) -> None:
214226
"""
@@ -217,22 +229,31 @@ def build_model(
217229
218230
Parameters
219231
----------
220-
data : dict
221-
Preformated data that is going to be used in the model. For efficiency reasons it should contain only the necesary data columns,
222-
not entire available dataset since it's going to be encoded into data used to recreate the model.
223-
If not provided uses data from self.data
224-
model_config : dict
225-
Dictionary where keys are strings representing names of parameters of the model, values are dictionaries of parameters
226-
needed for creating model parameters. If not provided uses data from self.model_config
232+
X : pd.DataFrame
233+
The input data that is going to be used in the model. This should be a DataFrame
234+
containing the features (predictors) for the model. For efficiency reasons, it should
235+
only contain the necessary data columns, not the entire available dataset, as this
236+
will be encoded into the data used to recreate the model.
237+
238+
y : pd.Series
239+
The target data for the model. This should be a Series representing the output
240+
or dependent variable for the model.
241+
242+
kwargs : dict
243+
Additional keyword arguments that may be used for model configuration.
227244
228245
See Also
229246
--------
230247
default_model_config : returns default model config
231248
232-
Returns:
233-
----------
249+
Returns
250+
-------
234251
None
235252
253+
Raises
254+
------
255+
NotImplementedError
256+
This is an abstract method and must be implemented in a subclass.
236257
"""
237258
raise NotImplementedError
238259

@@ -248,7 +269,7 @@ def sample_model(self, **kwargs):
248269
Returns
249270
-------
250271
xarray.Dataset
251-
The PyMC3 samples dataset.
272+
The PyMC samples dataset.
252273
253274
Raises
254275
------
@@ -383,12 +404,14 @@ def load(cls, fname: str):
383404
filepath = Path(str(fname))
384405
idata = az.from_netcdf(filepath)
385406
model = cls(
386-
data=idata.fit_data.to_dataframe(),
387407
model_config=json.loads(idata.attrs["model_config"]),
388408
sampler_config=json.loads(idata.attrs["sampler_config"]),
389409
)
390410
model.idata = idata
391-
model.build_model()
411+
dataset = idata.fit_data.to_dataframe()
412+
X = dataset.drop(columns=[model.output_var])
413+
y = dataset[model.output_var]
414+
model.build_model(X, y)
392415
# All previously used data is in idata.
393416

394417
if model.id != idata.attrs["id"]:
@@ -400,8 +423,8 @@ def load(cls, fname: str):
400423

401424
def fit(
402425
self,
403-
X: Union[np.ndarray, pd.DataFrame, pd.Series],
404-
y: Union[np.ndarray, pd.Series],
426+
X: pd.DataFrame,
427+
y: pd.Series,
405428
progressbar: bool = True,
406429
predictor_names: List[str] = None,
407430
random_seed: RandomState = None,
@@ -442,25 +465,19 @@ def fit(
442465
if predictor_names is None:
443466
predictor_names = []
444467

445-
X, y = X, y
446-
447-
self.build_model(data=self.data)
448-
self._data_setter(X, y)
468+
y = pd.DataFrame({self.output_var: y})
469+
self.generate_and_preprocess_model_data(X, y.values.flatten())
470+
self.build_model(self.X, self.y)
449471

450472
sampler_config = self.sampler_config.copy()
451473
sampler_config["progressbar"] = progressbar
452474
sampler_config["random_seed"] = random_seed
453475
sampler_config.update(**kwargs)
454-
455476
self.idata = self.sample_model(**sampler_config)
456-
if type(X) is np.ndarray:
457-
if len(predictor_names) > 0:
458-
X = pd.DataFrame(X, columns=predictor_names)
459-
else:
460-
X = pd.DataFrame(X, columns=[f"predictor{x}" for x in range(1, X.shape[1] + 1)])
461-
if type(y) is np.ndarray:
462-
y = pd.Series(y, name="target")
463-
combined_data = pd.concat([X, y], axis=1)
477+
478+
X_df = pd.DataFrame(X, columns=X.columns)
479+
combined_data = pd.concat([X_df, y], axis=1)
480+
assert all(combined_data.columns), "All columns must have non-empty names"
464481
self.idata.add_groups(fit_data=combined_data.to_xarray()) # type: ignore
465482
return self.idata # type: ignore
466483

@@ -513,6 +530,7 @@ def predict(
513530
def sample_prior_predictive(
514531
self,
515532
X_pred,
533+
y_pred=None,
516534
samples: Optional[int] = None,
517535
extend_idata: bool = False,
518536
combined: bool = True,
@@ -539,13 +557,15 @@ def sample_prior_predictive(
539557
prior_predictive_samples : DataArray, shape (n_pred, samples)
540558
Prior predictive samples for each input X_pred
541559
"""
560+
if y_pred is None:
561+
y_pred = np.zeros(len(X_pred))
542562
if samples is None:
543563
samples = self.sampler_config.get("draws", 500)
544564

545565
if self.model is None:
546-
self.build_model()
566+
self.build_model(X_pred, y_pred)
547567

548-
self._data_setter(X_pred)
568+
self._data_setter(X_pred, y_pred)
549569
if self.model is not None:
550570
with self.model: # sample with new input data
551571
prior_pred: az.InferenceData = pm.sample_prior_predictive(samples, **kwargs)

pymc_experimental/preprocessing/__init__.py

Whitespace-only changes.
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
import pandas as pd
2+
from sklearn.base import BaseEstimator, TransformerMixin
3+
from sklearn.preprocessing import StandardScaler
4+
5+
6+
class StandardScalerDF(StandardScaler, TransformerMixin, BaseEstimator):
7+
def __init__(self, with_mean=True, with_std=True):
8+
super().__init__(with_mean=with_mean, with_std=with_std)
9+
10+
def transform(self, X, y=None):
11+
z = super().transform(X)
12+
return pd.DataFrame(z, index=X.index, columns=X.columns)
13+
14+
def fit_transform(self, X, y=None):
15+
z = super().fit_transform(X)
16+
return pd.DataFrame(z, index=X.index, columns=X.columns)

0 commit comments

Comments
 (0)