Skip to content

Commit e22e3af

Browse files
Merge pull request #8 from Quantmetry/fix_rpca
regressor patched
2 parents 6affb53 + 0fbceac commit e22e3af

File tree

3 files changed

+34
-28
lines changed

3 files changed

+34
-28
lines changed

examples/benchmark.md

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -126,16 +126,14 @@ imputer_residuals = imputers.ImputerResiduals(groups=["station"], period=7, mode
126126
imputer_rpca = imputers.ImputerRPCA(groups=["station"], columnwise=True, period=365, max_iter=200, tau=2, lam=.3)
127127
imputer_rpca_opti = imputers.ImputerRPCA(groups=["station"], columnwise=True, period=365, max_iter=100)
128128

129-
imputer_ou = imputers.ImputeEM(groups=["station"], method="multinormal", max_iter_em=34, n_iter_ou=15, strategy="ou")
130-
imputer_tsou = imputers.ImputeEM(groups=["station"], method="VAR1", strategy="ou", max_iter_em=34, n_iter_ou=15)
131-
imputer_tsmle = imputers.ImputeEM(groups=["station"], method="VAR1", strategy="mle", max_iter_em=34, n_iter_ou=15)
129+
imputer_ou = imputers.ImputerEM(groups=["station"], method="multinormal", max_iter_em=34, n_iter_ou=15, strategy="ou")
130+
imputer_tsou = imputers.ImputerEM(groups=["station"], method="VAR1", strategy="ou", max_iter_em=34, n_iter_ou=15)
131+
imputer_tsmle = imputers.ImputerEM(groups=["station"], method="VAR1", strategy="mle", max_iter_em=34, n_iter_ou=15)
132132

133133

134134
imputer_knn = imputers.ImputerKNN(groups=["station"], k=10)
135135
imputer_iterative = imputers.ImputerMICE(groups=["station"], estimator=LinearRegression(), sample_posterior=False, max_iter=100, missing_values=np.nan)
136-
impute_regressor = imputers.ImputerRegressor(
137-
HistGradientBoostingRegressor(), cols_to_impute=cols_to_impute
138-
)
136+
impute_regressor = imputers.ImputerRegressor(LinearRegression, groups=["station"])
139137
impute_stochastic_regressor = imputers.ImputerStochasticRegressor(
140138
HistGradientBoostingRegressor(), cols_to_impute=cols_to_impute
141139
)
@@ -151,12 +149,13 @@ dict_imputers = {
151149
"OU": imputer_ou,
152150
"TSOU": imputer_tsou,
153151
"TSMLE": imputer_tsmle,
154-
"RPCA": imputer_rpca,
152+
# "RPCA": imputer_rpca,
155153
# "RPCA_opti": imputer_rpca_opti,
156154
# "locf": imputer_locf,
157155
# "nocb": imputer_nocb,
158156
# "knn": imputer_knn,
159-
# "iterative": imputer_iterative,
157+
"iterative": impute_regressor,
158+
"regressor": imputer_iterative,
160159
}
161160
n_imputers = len(dict_imputers)
162161

@@ -183,7 +182,7 @@ Concretely, the comparator takes as input a dataframe to impute, a proportion of
183182
Note that these metrics compute reconstruction errors; they tell nothing about the distances between the "true" and "imputed" distributions.
184183

185184
```python tags=[]
186-
generator_holes = missing_patterns.EmpiricalHoleGenerator(n_splits=2, groups=["station"], ratio_masked=ratio_masked)
185+
generator_holes = missing_patterns.EmpiricalHoleGenerator(n_splits=10, groups=["station"], ratio_masked=ratio_masked)
187186

188187
comparison = comparator.Comparator(
189188
dict_imputers,
-871 Bytes
Loading

qolmat/imputations/imputers.py

Lines changed: 26 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import sys
2-
from typing import Dict, List, Optional, Union
2+
from typing import Any, Dict, List, Optional, Union
33

44
import sklearn.neighbors._base
55

@@ -112,7 +112,7 @@ class ImputerMean(Imputer):
112112

113113
def __init__(
114114
self,
115-
groups: Optional[List[str]] = [],
115+
groups: List[str] = [],
116116
) -> None:
117117
super().__init__(groups=groups, columnwise=True)
118118
self.fit_transform_element = pd.DataFrame.mean
@@ -137,7 +137,7 @@ class ImputerMedian(Imputer):
137137

138138
def __init__(
139139
self,
140-
groups: Optional[List[str]] = [],
140+
groups: List[str] = [],
141141
) -> None:
142142
super().__init__(groups=groups, columnwise=True)
143143
self.fit_transform_element = pd.DataFrame.median
@@ -162,7 +162,7 @@ class ImputerMode(Imputer):
162162

163163
def __init__(
164164
self,
165-
groups: Optional[List[str]] = [],
165+
groups: List[str] = [],
166166
) -> None:
167167
super().__init__(groups=groups, columnwise=True)
168168
self.fit_transform_element = lambda df: df.mode().iloc[0]
@@ -187,7 +187,7 @@ class ImputerShuffle(Imputer):
187187

188188
def __init__(
189189
self,
190-
groups: Optional[List[str]] = [],
190+
groups: List[str] = [],
191191
) -> None:
192192
super().__init__(groups=groups, columnwise=True)
193193

@@ -225,7 +225,7 @@ class ImputerLOCF(Imputer):
225225

226226
def __init__(
227227
self,
228-
groups: Optional[List[str]] = [],
228+
groups: List[str] = [],
229229
) -> None:
230230
super().__init__(groups=groups, columnwise=True)
231231

@@ -259,7 +259,7 @@ class ImputerNOCB(Imputer):
259259

260260
def __init__(
261261
self,
262-
groups: Optional[List[str]] = [],
262+
groups: List[str] = [],
263263
) -> None:
264264
super().__init__(groups=groups, columnwise=True)
265265

@@ -305,7 +305,7 @@ class ImputerInterpolation(Imputer):
305305

306306
def __init__(
307307
self,
308-
groups: Optional[List[str]] = [],
308+
groups: List[str] = [],
309309
method: str = "linear",
310310
order: int = None,
311311
col_time: Optional[str] = None,
@@ -373,7 +373,7 @@ class ImputerResiduals(Imputer):
373373

374374
def __init__(
375375
self,
376-
groups: Optional[List[str]] = [],
376+
groups: List[str] = [],
377377
period: int = None,
378378
model_tsa: Optional[str] = "additive",
379379
extrapolate_trend: Optional[Union[int, str]] = "freq",
@@ -437,11 +437,12 @@ class ImputerKNN(Imputer):
437437

438438
def __init__(
439439
self,
440+
groups: List[str] = [],
440441
n_neighbors: int = 5,
441442
weights: str = "distance",
442443
**hyperparams,
443444
) -> None:
444-
super().__init__(columnwise=False, hyperparams=hyperparams)
445+
super().__init__(groups=groups, columnwise=False, hyperparams=hyperparams)
445446
self.n_neighbors = n_neighbors
446447
self.weights = weights
447448

@@ -562,8 +563,10 @@ class ImputerRegressor(Imputer):
562563
>>> imputor.fit_transform(df)
563564
"""
564565

565-
def __init__(self, type_model, fit_on_nan: bool = False, **hyperparams):
566-
super().__init__(hyperparams=hyperparams)
566+
def __init__(
567+
self, type_model: Any, groups: List[str] = [], fit_on_nan: bool = False, **hyperparams
568+
):
569+
super().__init__(groups=groups, hyperparams=hyperparams)
567570
self.columnwise = False
568571
self.type_model = type_model
569572
self.fit_on_nan = fit_on_nan
@@ -590,7 +593,7 @@ def fit_transform_element(self, df: pd.DataFrame) -> pd.DataFrame:
590593

591594
for col in cols_with_nans:
592595
hyperparams = {}
593-
for hyperparam, value in self.hyperparams.items():
596+
for hyperparam, value in self.hyperparams_element.items():
594597
if isinstance(value, dict):
595598
value = value[col]
596599
hyperparams[hyperparam] = value
@@ -600,11 +603,15 @@ def fit_transform_element(self, df: pd.DataFrame) -> pd.DataFrame:
600603
if self.fit_on_nan:
601604
X = df.drop(columns=col)
602605
else:
603-
X = df[cols_without_nans].drop(columns=col)
606+
X = df[cols_without_nans].drop(columns=col, errors="ignore")
604607
y = df[col]
605608
is_na = y.isna()
606-
model.fit(X[~is_na], y[~is_na])
607-
df_imputed.loc[is_na, col] = model.predict(X[is_na])
609+
if X.empty:
610+
y_imputed = pd.Series(y.mean(), index=y.index)
611+
else:
612+
model.fit(X[~is_na], y[~is_na])
613+
y_imputed = model.predict(X[is_na])
614+
df_imputed.loc[is_na, col] = y_imputed
608615

609616
return df_imputed
610617

@@ -633,8 +640,8 @@ class ImputerStochasticRegressor(Imputer):
633640
>>> imputor.fit_transform(df)
634641
"""
635642

636-
def __init__(self, type_model, **hyperparams) -> None:
637-
super().__init__(hyperparams=hyperparams)
643+
def __init__(self, type_model: str, groups: List[str] = [], **hyperparams) -> None:
644+
super().__init__(groups=groups, hyperparams=hyperparams)
638645
self.type_model = type_model
639646

640647
def fit_transform_element(self, df: pd.DataFrame) -> pd.Series:
@@ -728,7 +735,7 @@ def fit_transform_element(self, df: pd.DataFrame) -> pd.DataFrame:
728735
return df_imputed
729736

730737

731-
class ImputeEM(Imputer):
738+
class ImputerEM(Imputer):
732739
def __init__(
733740
self,
734741
groups: List[str] = [],

0 commit comments

Comments
 (0)