Skip to content

Commit a07c82f

Browse files
Julien RousselJulien Roussel
authored andcommitted
estimator based models debugged
1 parent 0fbceac commit a07c82f

File tree

4 files changed

+87
-39
lines changed

4 files changed

+87
-39
lines changed

examples/benchmark.md

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -64,10 +64,11 @@ This dataset only contains numerical variables.
6464

6565
```python
6666
df_data = data.get_data_corrupted("Beijing", ratio_masked=.2, mean_size=120)
67+
df_data["cat"] = [i % 3 for i in range(len(df_data))]
6768

6869
# cols_to_impute = ["TEMP", "PRES", "DEWP", "NO2", "CO", "O3", "WSPM"]
6970
# cols_to_impute = df_data.columns[df_data.isna().any()]
70-
cols_to_impute = ["TEMP", "PRES"]
71+
cols_to_impute = ["TEMP", "PRES", "cat"]
7172

7273
```
7374

@@ -112,9 +113,13 @@ All presented methods are group-wise: here each station is imputed independently
112113
Some methods require hyperparameters. The user can either specify them directly or determine them through an optimization step using the `search_params` dictionary. Its keys are the names of the imputation methods, and each value is a dictionary specifying the minimum, maximum or list of categories and the type of values (Integer, Real, Category or a dictionary indexed by the variable names) to search.
113114
In practice, we rely on cross-validation to find the hyperparameter values that minimize the reconstruction error.
114115

116+
```python tags=[]
117+
hasattr(imputers.ImputerMean(), "groups")
118+
```
119+
115120
```python
116121
imputer_mean = imputers.ImputerMean(groups=["station"])
117-
imputer_median = imputers.ImputerMedian(groups=["station"])
122+
imputer_median = imputers.ImputerMedian(groups=["station", "cat"])
118123
imputer_mode = imputers.ImputerMode(groups=["station"])
119124
imputer_locf = imputers.ImputerLOCF(groups=["station"])
120125
imputer_nocb = imputers.ImputerNOCB(groups=["station"])
@@ -133,14 +138,12 @@ imputer_tsmle = imputers.ImputerEM(groups=["station"], method="VAR1", strategy="
133138

134139
imputer_knn = imputers.ImputerKNN(groups=["station"], k=10)
135140
imputer_iterative = imputers.ImputerMICE(groups=["station"], estimator=LinearRegression(), sample_posterior=False, max_iter=100, missing_values=np.nan)
136-
impute_regressor = imputers.ImputerRegressor(LinearRegression, groups=["station"])
137-
impute_stochastic_regressor = imputers.ImputerStochasticRegressor(
138-
HistGradientBoostingRegressor(), cols_to_impute=cols_to_impute
139-
)
141+
impute_regressor = imputers.ImputerRegressor(groups=["station"], estimator=LinearRegression())
142+
impute_stochastic_regressor = imputers.ImputerStochasticRegressor(groups=["station"], estimator=LinearRegression())
140143

141144
dict_imputers = {
142145
"mean": imputer_mean,
143-
# "median": imputer_median,
146+
"median": imputer_median,
144147
# "mode": imputer_mode,
145148
"interpolation": imputer_interpol,
146149
# "spline": imputer_spline,
@@ -182,7 +185,7 @@ Concretely, the comparator takes as input a dataframe to impute, a proportion of
182185
Note that these metrics measure reconstruction errors; they say nothing about the distance between the "true" and "imputed" distributions.
183186

184187
```python tags=[]
185-
generator_holes = missing_patterns.EmpiricalHoleGenerator(n_splits=10, groups=["station"], ratio_masked=ratio_masked)
188+
generator_holes = missing_patterns.EmpiricalHoleGenerator(n_splits=2, groups=["station"], ratio_masked=ratio_masked)
186189

187190
comparison = comparator.Comparator(
188191
dict_imputers,
175 KB
Loading

qolmat/imputations/imputers.py

Lines changed: 49 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
1+
import abc
2+
import copy
13
import sys
24
from typing import Any, Dict, List, Optional, Union
35

46
import sklearn.neighbors._base
7+
from sklearn.base import BaseEstimator
58

69
sys.modules["sklearn.neighbors.base"] = sklearn.neighbors._base
710

@@ -20,12 +23,19 @@
2023

2124

2225
class Imputer(_BaseImputer):
23-
def __init__(self, groups: List[str] = [], columnwise: bool = False, hyperparams: Dict = {}):
26+
def __init__(
27+
self,
28+
groups: List[str] = [],
29+
columnwise: bool = False,
30+
shrink: bool = False,
31+
hyperparams: Dict = {},
32+
):
2433
self.hyperparams_user = hyperparams
2534
self.hyperparams_optim = {}
2635
self.hyperparams_local = {}
2736
self.groups = groups
2837
self.columnwise = columnwise
38+
self.shrink = shrink
2939

3040
def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
3141
"""
@@ -47,6 +57,12 @@ def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
4757
hyperparams = self.hyperparams_user.copy()
4858
hyperparams.update(self.hyperparams_optim)
4959
cols_with_nans = df.columns[df.isna().any()]
60+
61+
if self.groups == []:
62+
self.ngroups = pd.Series(0, index=df.index).rename("_ngroup")
63+
else:
64+
self.ngroups = df.groupby(self.groups).ngroup().rename("_ngroup")
65+
5066
if self.columnwise:
5167

5268
# imputed = pd.DataFrame(index=df.index, columns=df.columns)
@@ -79,8 +95,12 @@ def impute_element(self, df: pd.DataFrame) -> pd.DataFrame:
7995
df = df.copy()
8096
if self.groups:
8197

82-
groupby = utils.custom_groupby(df, self.groups)
83-
imputation_values = groupby.apply(self.fit_transform_element)
98+
# groupby = utils.custom_groupby(df, self.groups)
99+
groupby = df.groupby(self.ngroups, group_keys=False)
100+
if self.shrink:
101+
imputation_values = groupby.transform(self.fit_transform_element)
102+
else:
103+
imputation_values = groupby.apply(self.fit_transform_element)
84104
else:
85105
imputation_values = self.fit_transform_element(df)
86106

@@ -114,7 +134,7 @@ def __init__(
114134
self,
115135
groups: List[str] = [],
116136
) -> None:
117-
super().__init__(groups=groups, columnwise=True)
137+
super().__init__(groups=groups, columnwise=True, shrink=True)
118138
self.fit_transform_element = pd.DataFrame.mean
119139

120140

@@ -139,7 +159,7 @@ def __init__(
139159
self,
140160
groups: List[str] = [],
141161
) -> None:
142-
super().__init__(groups=groups, columnwise=True)
162+
super().__init__(groups=groups, columnwise=True, shrink=True)
143163
self.fit_transform_element = pd.DataFrame.median
144164

145165

@@ -164,7 +184,7 @@ def __init__(
164184
self,
165185
groups: List[str] = [],
166186
) -> None:
167-
super().__init__(groups=groups, columnwise=True)
187+
super().__init__(groups=groups, columnwise=True, shrink=True)
168188
self.fit_transform_element = lambda df: df.mode().iloc[0]
169189

170190

@@ -509,9 +529,11 @@ class ImputerMICE(Imputer):
509529
def __init__(
510530
self,
511531
groups: List[str] = [],
532+
estimator: Optional[BaseEstimator] = None,
512533
**hyperparams,
513534
) -> None:
514535
super().__init__(groups=groups, columnwise=False, hyperparams=hyperparams)
536+
self.estimator = estimator
515537

516538
def fit_transform_element(self, df: pd.DataFrame) -> pd.DataFrame:
517539
"""
@@ -530,7 +552,7 @@ def fit_transform_element(self, df: pd.DataFrame) -> pd.DataFrame:
530552
if not isinstance(df, pd.DataFrame):
531553
raise ValueError("Input has to be a pandas.DataFrame.")
532554

533-
iterative_imputer = IterativeImputer(**self.hyperparams_element)
555+
iterative_imputer = IterativeImputer(estimator=self.estimator, **self.hyperparams_element)
534556
res = iterative_imputer.fit_transform(df.values)
535557
imputed = pd.DataFrame(columns=df.columns)
536558
for ind, col in enumerate(imputed.columns):
@@ -564,11 +586,15 @@ class ImputerRegressor(Imputer):
564586
"""
565587

566588
def __init__(
567-
self, type_model: Any, groups: List[str] = [], fit_on_nan: bool = False, **hyperparams
589+
self,
590+
groups: List[str] = [],
591+
estimator: Optional[BaseEstimator] = None,
592+
fit_on_nan: bool = False,
593+
**hyperparams,
568594
):
569595
super().__init__(groups=groups, hyperparams=hyperparams)
570596
self.columnwise = False
571-
self.type_model = type_model
597+
self.estimator = estimator
572598
self.fit_on_nan = fit_on_nan
573599

574600
def fit_transform_element(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -598,7 +624,9 @@ def fit_transform_element(self, df: pd.DataFrame) -> pd.DataFrame:
598624
value = value[col]
599625
hyperparams[hyperparam] = value
600626

601-
model = self.type_model(**hyperparams)
627+
# model = copy.deepcopy(self.estimator)
628+
# for hyperparam, value in hyperparams.items():
629+
# setattr(model, hyperparam, value)
602630

603631
if self.fit_on_nan:
604632
X = df.drop(columns=col)
@@ -609,8 +637,8 @@ def fit_transform_element(self, df: pd.DataFrame) -> pd.DataFrame:
609637
if X.empty:
610638
y_imputed = pd.Series(y.mean(), index=y.index)
611639
else:
612-
model.fit(X[~is_na], y[~is_na])
613-
y_imputed = model.predict(X[is_na])
640+
self.estimator.fit(X[~is_na], y[~is_na])
641+
y_imputed = self.estimator.predict(X[is_na])
614642
df_imputed.loc[is_na, col] = y_imputed
615643

616644
return df_imputed
@@ -632,17 +660,19 @@ class ImputerStochasticRegressor(Imputer):
632660
>>> import pandas as pd
633661
>>> from qolmat.imputations.models import ImputeStochasticRegressor
634662
>>> from sklearn.ensemble import ExtraTreesRegressor
635-
>>> imputor = ImputeStochasticRegressor(model=ExtraTreesRegressor())
663+
>>> imputer = ImputeStochasticRegressor(estimator=ExtraTreesRegressor())
636664
>>> df = pd.DataFrame(data=[[1, 1, 1, 1],
637665
>>> [np.nan, np.nan, 2, 3],
638666
>>> [1, 2, 2, 5], [2, 2, 2, 2]],
639667
>>> columns=["var1", "var2", "var3", "var4"])
640-
>>> imputor.fit_transform(df)
668+
>>> imputer.fit_transform(df)
641669
"""
642670

643-
def __init__(self, type_model: str, groups: List[str] = [], **hyperparams) -> None:
671+
def __init__(
672+
self, groups: List[str] = [], estimator: Optional[BaseEstimator] = None, **hyperparams
673+
) -> None:
644674
super().__init__(groups=groups, hyperparams=hyperparams)
645-
self.type_model = type_model
675+
self.estimator = estimator
646676

647677
def fit_transform_element(self, df: pd.DataFrame) -> pd.Series:
648678
"""
@@ -659,7 +689,6 @@ def fit_transform_element(self, df: pd.DataFrame) -> pd.Series:
659689
imputed dataframe
660690
"""
661691
df_imp = df.copy()
662-
model = self.type_model(**self.hyperparams)
663692
cols_with_nans = df.columns[df.isna().any()]
664693
cols_without_nans = df.columns[df.notna().all()]
665694

@@ -670,8 +699,8 @@ def fit_transform_element(self, df: pd.DataFrame) -> pd.Series:
670699
X = df[cols_without_nans]
671700
y = df[col]
672701
is_na = y.isna()
673-
model.fit(X[~is_na], y[~is_na])
674-
y_pred = model.predict(X)
702+
self.estimator.fit(X[~is_na], y[~is_na])
703+
y_pred = self.estimator.predict(X)
675704
std_error = (y_pred[~is_na] - y[~is_na]).std()
676705
random_pred = np.random.normal(size=len(y), loc=y_pred, scale=std_error)
677706
df_imp.loc[is_na, col] = random_pred[is_na]
@@ -696,8 +725,8 @@ class ImputerRPCA(Imputer):
696725

697726
def __init__(
698727
self,
699-
method: str = "noisy",
700728
groups: List[str] = [],
729+
method: str = "noisy",
701730
columnwise: bool = False,
702731
**hyperparams,
703732
) -> None:

qolmat/utils/data.py

Lines changed: 27 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ def get_data(name_data="Beijing", datapath: str = "data/", download: Optional[bo
8282

8383

8484
def preprocess_data(df: pd.DataFrame):
85-
"""Put data into dataframe
85+
"""Preprocess data from the "Beijing" dataset
8686
8787
Parameters
8888
----------
@@ -106,14 +106,14 @@ def preprocess_data(df: pd.DataFrame):
106106
return df
107107

108108

109-
def add_holes(X: pd.DataFrame, ratio_masked: float, mean_size: int):
109+
def add_holes(df: pd.DataFrame, ratio_masked: float, mean_size: int):
110110
"""
111-
Creates holes in a dataset with no missing value. Only used in the documentation to design
111+
Creates holes in a dataset with no missing value, starting from `df`. Only used in the documentation to design
112112
examples.
113113
114114
Parameters
115115
----------
116-
X : pd.DataFrame
116+
df : pd.DataFrame
117117
dataframe with no missing values
118118
119119
mean_size : int
@@ -130,18 +130,18 @@ def add_holes(X: pd.DataFrame, ratio_masked: float, mean_size: int):
130130
pd.DataFrame
131131
dataframe with missing values
132132
"""
133-
groups = X.index.names.difference(["datetime", "date", "index"])
133+
groups = df.index.names.difference(["datetime", "date", "index"])
134134
generator = missing_patterns.GeometricHoleGenerator(
135-
1, ratio_masked=ratio_masked, subset=X.columns, groups=groups
135+
1, ratio_masked=ratio_masked, subset=df.columns, groups=groups
136136
)
137137

138-
generator.dict_probas_out = {column: 1 / mean_size for column in X.columns}
139-
generator.dict_ratios = {column: 1 / len(X.columns) for column in X.columns}
138+
generator.dict_probas_out = {column: 1 / mean_size for column in df.columns}
139+
generator.dict_ratios = {column: 1 / len(df.columns) for column in df.columns}
140140
if generator.groups == []:
141-
mask = generator.generate_mask(X)
141+
mask = generator.generate_mask(df)
142142
else:
143-
mask = X.groupby(groups, group_keys=False).apply(generator.generate_mask)
144-
X_with_nans = X.copy()
143+
mask = df.groupby(groups, group_keys=False).apply(generator.generate_mask)
144+
X_with_nans = df.copy()
145145
X_with_nans[mask] = np.nan
146146
return X_with_nans
147147

@@ -151,6 +151,22 @@ def get_data_corrupted(
151151
mean_size: int = 90,
152152
ratio_masked: float = 0.2,
153153
):
154+
"""
155+
Returns a dataframe with controlled corruption obtained from the source `name_data`
156+
157+
Parameters
158+
----------
159+
name_data : str
160+
Name of the data source, can be "Beijing" or "Artificial"
161+
mean_size: int
162+
Mean size of the holes to be generated using a geometric law
163+
ratio_masked: float
164+
Percent of missing data in each column in the output dataframe
165+
Returns
166+
-------
167+
pd.DataFrame
168+
Dataframe with missing values
169+
"""
154170
df = get_data(name_data)
155171
df = add_holes(df, mean_size=mean_size, ratio_masked=ratio_masked)
156172
return df

0 commit comments

Comments
 (0)