Skip to content

Commit 3dab35a

Browse files
author
vm-aifluence-jro
committed
Merge branch 'dev' into doc_readme
2 parents 1bb8b7f + d5c42b1 commit 3dab35a

File tree

3 files changed

+102
-31
lines changed

3 files changed

+102
-31
lines changed

qolmat/imputations/imputers.py

Lines changed: 71 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import warnings
22
from typing import Dict, List, Optional, Union
3+
from abc import abstractmethod
34

45
import numpy as np
56
import pandas as pd
@@ -41,19 +42,38 @@ class Imputer(_BaseImputer):
4142

4243
def __init__(
4344
self,
44-
groups: List[str] = [],
4545
columnwise: bool = False,
4646
shrink: bool = False,
47-
hyperparams: Dict = {},
4847
random_state: Union[None, int, np.random.RandomState] = None,
48+
missing_values=np.nan,
49+
groups: List[str] = [],
50+
hyperparams: Dict = {},
4951
):
50-
self.hyperparams_user = hyperparams
51-
self.groups = groups
5252
self.columnwise = columnwise
5353
self.shrink = shrink
5454
self.random_state = random_state
55-
56-
def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
55+
self.missing_values = missing_values
56+
self.groups = groups
57+
self.hyperparams = hyperparams
58+
59+
def _more_tags(self):
60+
"""Define tags for scikit-learn"""
61+
62+
return {
63+
"allow_nan": True,
64+
"requires_fit": False,
65+
"_xfail_checks": {
66+
"check_parameters_default_constructible": "The imputer need Dict as a parammeter",
67+
"check_no_attributes_set_in_init": """The imputer can define an attribute
68+
modifiable in init""",
69+
},
70+
}
71+
72+
def fit(self, X, y: pd.DataFrame = None):
73+
X = self._validate_data(X, force_all_finite="allow-nan")
74+
return self
75+
76+
def fit_transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
5777
"""
5878
Returns a dataframe with same shape as `df`, unchanged values, where all nans are replaced
5979
by non-nan values.
@@ -70,24 +90,27 @@ def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
7090
pd.DataFrame
7191
Imputed dataframe.
7292
"""
73-
if not isinstance(df, pd.DataFrame):
74-
raise ValueError("Input has to be a pandas.DataFrame.")
93+
self.fit(X)
94+
95+
if not isinstance(X, (pd.DataFrame, np.ndarray)):
96+
raise ValueError("Input has to be a pandas.DataFrame or numpy.ndarray.")
97+
df = pd.DataFrame(X)
7598
for column in df:
7699
if df[column].isnull().all():
77100
raise ValueError("Input contains a column full of NaN")
78101
self.rng = sku.check_random_state(self.random_state)
79102
if hasattr(self, "estimator") and hasattr(self.estimator, "random_state"):
80103
self.estimator.random_state = self.rng
81104

82-
hyperparams = self.hyperparams_user.copy()
105+
hyperparams = self.hyperparams.copy()
83106
if hasattr(self, "hyperparams_optim"):
84107
hyperparams.update(self.hyperparams_optim)
85108
cols_with_nans = df.columns[df.isna().any()]
86109

87110
if self.groups == []:
88-
self.ngroups = pd.Series(0, index=df.index).rename("_ngroup")
111+
self.ngroups_ = pd.Series(0, index=df.index).rename("_ngroup")
89112
else:
90-
self.ngroups = df.groupby(self.groups).ngroup().rename("_ngroup")
113+
self.ngroups_ = df.groupby(self.groups).ngroup().rename("_ngroup")
91114

92115
if self.columnwise:
93116
df_imputed = df.copy()
@@ -156,8 +179,8 @@ def impute_element(self, df: pd.DataFrame) -> pd.DataFrame:
156179
raise ValueError("Input has to be a pandas.DataFrame.")
157180
df = df.copy()
158181
if self.groups:
159-
# groupby = utils.custom_groupby(df, self.groups)
160-
groupby = df.groupby(self.ngroups, group_keys=False)
182+
# groupby = utils.custom_groupby(df, groups)
183+
groupby = df.groupby(self.ngroups_, group_keys=False)
161184
if self.shrink:
162185
imputation_values = groupby.transform(self.fit_transform_element)
163186
else:
@@ -173,6 +196,10 @@ def impute_element(self, df: pd.DataFrame) -> pd.DataFrame:
173196

174197
return df
175198

199+
@abstractmethod
200+
def fit_transform_element(self, df: pd.DataFrame):
201+
return df
202+
176203

177204
class ImputerOracle(Imputer):
178205
"""
@@ -195,7 +222,7 @@ def __init__(
195222
super().__init__()
196223
self.df = df
197224

198-
def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
225+
def fit_transform(self, X: pd.DataFrame, y=None) -> pd.DataFrame:
199226
"""Impute df with corresponding known values
200227
201228
Parameters
@@ -207,8 +234,10 @@ def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
207234
pd.DataFrame
208235
dataframe imputed with premasked values
209236
"""
210-
if not isinstance(df, pd.DataFrame):
211-
raise ValueError("Input has to be a pandas.DataFrame.")
237+
self.fit(X)
238+
if not isinstance(X, (pd.DataFrame, np.ndarray)):
239+
raise ValueError("Input has to be a pandas.DataFrame or numpy.ndarray.")
240+
df = pd.DataFrame(X)
212241
return df.fillna(self.df)
213242

214243

@@ -244,7 +273,12 @@ def __init__(
244273
groups: List[str] = [],
245274
) -> None:
246275
super().__init__(groups=groups, columnwise=True, shrink=True)
247-
self.fit_transform_element = pd.DataFrame.mean
276+
277+
def _more_tags(self):
278+
return {"allow_nan": True, "requires_fit": False}
279+
280+
def fit_transform_element(self, df: pd.DataFrame):
281+
return pd.DataFrame.mean(df)
248282

249283

250284
class ImputerMedian(Imputer):
@@ -279,7 +313,9 @@ def __init__(
279313
groups: List[str] = [],
280314
) -> None:
281315
super().__init__(groups=groups, columnwise=True, shrink=True)
282-
self.fit_transform_element = pd.DataFrame.median
316+
317+
def fit_transform_element(self, df: pd.DataFrame):
318+
return pd.DataFrame.median(df)
283319

284320

285321
class ImputerMode(Imputer):
@@ -314,7 +350,9 @@ def __init__(
314350
groups: List[str] = [],
315351
) -> None:
316352
super().__init__(groups=groups, columnwise=True, shrink=True)
317-
self.fit_transform_element = lambda df: df.mode().iloc[0]
353+
354+
def fit_transform_element(self, df: pd.DataFrame):
355+
return df.mode().iloc[0]
318356

319357

320358
class ImputerShuffle(Imputer):
@@ -647,6 +685,7 @@ def __init__(
647685
super().__init__(groups=groups, columnwise=False, hyperparams=hyperparams)
648686
self.n_neighbors = n_neighbors
649687
self.weights = weights
688+
self.hyperparams_optim: Dict = {}
650689

651690
def fit_transform_element(self, df: pd.DataFrame) -> pd.DataFrame:
652691
imputer = KNNImputer(
@@ -663,7 +702,8 @@ class ImputerMICE(Imputer):
663702
This class implements an iterative imputer in the multivariate case.
664703
It imputes each Series within a DataFrame multiple times using an iteration of fits
665704
and transformations to reach a stable state of imputation each time.
666-
It uses sklearn.impute.IterativeImputer, see the docs for more information about the arguments.
705+
It uses sklearn.impute.IterativeImputer, see the docs for more information about the
706+
arguments.
667707
668708
Parameters
669709
----------
@@ -711,6 +751,7 @@ def __init__(
711751
random_state=random_state,
712752
)
713753
self.estimator = estimator
754+
self.hyperparams_optim: Dict = {}
714755

715756
def fit_transform_element(self, df: pd.DataFrame) -> pd.DataFrame:
716757
iterative_imputer = IterativeImputer(estimator=self.estimator, **self.hyperparams_element)
@@ -769,6 +810,7 @@ def __init__(
769810
self.columnwise = False
770811
self.estimator = estimator
771812
self.handler_nan = handler_nan
813+
self.hyperparams_optim: Dict = {}
772814

773815
def get_params_fit(self) -> Dict:
774816
return {}
@@ -842,8 +884,8 @@ class ImputerRPCA(Imputer):
842884
"""
843885
This class implements the Robust Principal Component Analysis imputation.
844886
845-
The imputation minimizes a loss function combining a low-rank criterium on the dataframe and a
846-
L1 penalization on the residuals.
887+
The imputation minimizes a loss function combining a low-rank criterion on the dataframe and
888+
an L1 penalization on the residuals.
847889
848890
Parameters
849891
----------
@@ -852,10 +894,11 @@ class ImputerRPCA(Imputer):
852894
method : str
853895
Name of the RPCA method:
854896
"PCP" for basic RPCA, bad at imputing
855-
"noisy" for noisy RPCA, with possible regularisations, wihch is recommended since it is
856-
more stable
897+
"noisy" for noisy RPCA, with possible regularisations, which is recommended since
898+
it is more stable
857899
columnwise : bool
858-
For the RPCA method to be applied columnwise (with reshaping of each column into an array)
900+
For the RPCA method to be applied columnwise (with reshaping of
901+
each column into an array)
859902
or to be applied directly on the dataframe. By default, the value is set to False.
860903
"""
861904

@@ -875,6 +918,7 @@ def __init__(
875918
)
876919

877920
self.method = method
921+
self.hyperparams_optim: Dict = {}
878922

879923
def fit_transform_element(self, df: pd.DataFrame) -> pd.DataFrame:
880924
if not isinstance(df, pd.DataFrame):
@@ -890,7 +934,7 @@ def fit_transform_element(self, df: pd.DataFrame) -> pd.DataFrame:
890934
X = df.values.T
891935
M, A = model.decompose_rpca_signal(X)
892936
df_imputed = pd.DataFrame((M + A).T, index=df.index, columns=df.columns)
893-
df_imputed = df.where(df.isna(), df_imputed)
937+
df_imputed = df.where(~df.isna(), df_imputed)
894938

895939
return df_imputed
896940

@@ -933,6 +977,7 @@ def __init__(
933977
random_state=random_state,
934978
)
935979
self.model = model
980+
self.hyperparams_optim: Dict = {}
936981

937982
def fit_transform_element(self, df: pd.DataFrame) -> pd.DataFrame:
938983
if self.model == "multinormal":

tests/benchmark/test_comparator.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,8 @@
3535
index_tuples_expected = pd.MultiIndex.from_product(
3636
[["mae", "wmape", "KL_columnwise"], ["col1", "col2"]]
3737
)
38-
data_expected = [3.0, 0.5, 0.75, 0.5, 37.88948, 39.68123]
38+
# data_expected = [3.0, 0.5, 0.75, 0.5, 37.88948, 39.68123]
39+
data_expected = [4.467175, 7.467187, 1.116794, 7.467187, 37.491336, 36.977574]
3940
result_expected = pd.Series(data_expected, index=index_tuples_expected)
4041

4142

@@ -70,4 +71,4 @@ def test_benchmark_comparator_compare(df1: pd.DataFrame, imputer: str) -> None:
7071
else:
7172
result = comparison.compare(df_origin)
7273
result_expected_DataFrame = pd.DataFrame(result_expected)
73-
np.testing.assert_allclose(result, result_expected_DataFrame, atol=1e-5)
74+
np.testing.assert_allclose(result, result_expected_DataFrame, atol=1e-3)

tests/imputations/test_imputers.py

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import pandas as pd
55
import pytest
66
from sklearn.ensemble import ExtraTreesRegressor
7+
from sklearn.utils.estimator_checks import parametrize_with_checks
78

89
from qolmat.imputations import imputers
910

@@ -145,7 +146,7 @@ def test_ImputerInterpolation_fit_transform(df: pd.DataFrame) -> None:
145146

146147
@pytest.mark.parametrize("df", [df_timeseries])
147148
def test_ImputerResiduals_fit_transform(df: pd.DataFrame) -> None:
148-
imputer = imputers.ImputerResiduals(7)
149+
imputer = imputers.ImputerResiduals(period=7)
149150
result = imputer.fit_transform(df)
150151
expected = pd.DataFrame(
151152
{
@@ -191,7 +192,7 @@ def test_ImputerMICE_fit_transform(df: pd.DataFrame) -> None:
191192

192193
@pytest.mark.parametrize("df", [df_incomplete])
193194
def test_ImputerRegressor_fit_transform(df: pd.DataFrame) -> None:
194-
imputer = imputers.ImputerRegressor(model=ExtraTreesRegressor())
195+
imputer = imputers.ImputerRegressor(estimator=ExtraTreesRegressor())
195196
result = imputer.fit_transform(df)
196197
expected = pd.DataFrame(
197198
{
@@ -209,7 +210,7 @@ def test_ImputerRPCA_fit_transform(df: pd.DataFrame) -> None:
209210
expected = pd.DataFrame(
210211
{
211212
"col1": [i for i in range(20)],
212-
"col2": [0, 10.5, 2, 10.5, 2] + [i for i in range(5, 20)],
213+
"col2": [0, 25.375562, 2, 29.396932, 2] + [i for i in range(5, 20)],
213214
}
214215
)
215216
np.testing.assert_allclose(result, expected)
@@ -229,3 +230,27 @@ def test_ImputerEM_fit_transform(df: pd.DataFrame) -> None:
229230
}
230231
)
231232
np.testing.assert_allclose(result, expected, atol=1e-6)
233+
234+
235+
@parametrize_with_checks(
236+
[
237+
imputers.Imputer(),
238+
imputers.ImputerOracle(df_complete),
239+
imputers.ImputerMean(),
240+
imputers.ImputerMedian(),
241+
imputers.ImputerMode(),
242+
imputers.ImputerShuffle(),
243+
imputers.ImputerLOCF(),
244+
imputers.ImputerNOCB(),
245+
imputers.ImputerInterpolation(),
246+
imputers.ImputerResiduals(period=7),
247+
imputers.KNNImputer(),
248+
imputers.ImputerMICE(),
249+
imputers.ImputerRegressor(),
250+
imputers.ImputerRPCA(),
251+
imputers.ImputerEM(),
252+
]
253+
)
254+
def test_sklearn_compatible_estimator(estimator: imputers.Imputer, check: Any) -> None:
255+
"""Check compatibility with sklearn, using sklearn estimator checks API."""
256+
check(estimator)

0 commit comments

Comments
 (0)