Skip to content

Commit e2204ad

Browse files
Merge pull request #65 from Quantmetry/review_dcor
grouped imputation tested, metric_optim argument added to comparator
2 parents c149cd7 + aa3b00f commit e2204ad

File tree

4 files changed

+106
-63
lines changed

4 files changed

+106
-63
lines changed

qolmat/benchmark/comparator.py

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -34,6 +34,7 @@ def __init__(
3434
generator_holes: _HoleGenerator,
3535
metrics: List = ["mae", "wmape", "KL_columnwise"],
3636
dict_config_opti: Optional[Dict[str, Any]] = {},
37+
metric_optim: str = "mse",
3738
max_evals: int = 10,
3839
verbose: bool = False,
3940
):
@@ -42,6 +43,7 @@ def __init__(
4243
self.generator_holes = generator_holes
4344
self.metrics = metrics
4445
self.dict_config_opti = dict_config_opti
46+
self.metric_optim = metric_optim
4547
self.max_evals = max_evals
4648
self.verbose = verbose
4749

@@ -77,6 +79,7 @@ def evaluate_errors_sample(
7779
imputer: Any,
7880
df: pd.DataFrame,
7981
dict_config_opti_imputer: Dict[str, Any] = {},
82+
metric_optim: str = "mse",
8083
) -> pd.Series:
8184
"""Evaluate the errors in the cross-validation
8285
@@ -88,6 +91,8 @@ def evaluate_errors_sample(
8891
dataframe to impute
8992
dict_config_opti_imputer : Dict
9093
search space for tested_model's hyperparameters
94+
metric_optim : str
95+
Loss function used when imputers undergo hyperparameter optimization
9196
9297
Returns
9398
-------
@@ -99,7 +104,6 @@ def evaluate_errors_sample(
99104
for df_mask in self.generator_holes.split(df_origin):
100105
df_corrupted = df_origin.copy()
101106
df_corrupted[df_mask] = np.nan
102-
metric_optim = "mae"
103107
imputer_opti = hyperparameters.optimize(
104108
imputer,
105109
df,
@@ -142,7 +146,7 @@ def compare(
142146

143147
try:
144148
dict_errors[name] = self.evaluate_errors_sample(
145-
imputer, df, dict_config_opti_imputer
149+
imputer, df, dict_config_opti_imputer, self.metric_optim
146150
)
147151
print(f"Tested model: {type(imputer).__name__}")
148152
except Exception as excp:

qolmat/imputations/em_sampler.py

Lines changed: 11 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -177,18 +177,22 @@ def fit(self, X: NDArray):
177177
mask_na = np.isnan(X)
178178

179179
# first imputation
180-
X_sample_last = utils.linear_interpolation(X)
181-
self.fit_distribution(X_sample_last)
180+
X = utils.linear_interpolation(X)
181+
print("fit")
182+
print(X)
183+
print("fit_distribution")
184+
self.fit_distribution(X)
185+
print("...")
182186

183187
for iter_em in range(self.max_iter_em):
184-
X_sample_last = self._sample_ou(X_sample_last, mask_na)
188+
X = self._sample_ou(X, mask_na)
185189

186190
if self._check_convergence():
187191
# print(f"EM converged after {iter_em} iterations.")
188192
break
189193

190194
self.dict_criteria_stop = {key: [] for key in self.dict_criteria_stop}
191-
self.X_sample_last = X_sample_last
195+
self.X_sample_last = X
192196
return self
193197

194198
def transform(self, X: NDArray) -> NDArray:
@@ -314,7 +318,9 @@ def get_loglikelihood(self, X: NDArray) -> float:
314318
if np.all(np.isclose(self.cov, 0)):
315319
return 0
316320
else:
317-
return scipy.stats.multivariate_normal.logpdf(X.T, self.means, self.cov).mean()
321+
return scipy.stats.multivariate_normal.logpdf(
322+
X.T, self.means, self.cov, allow_singular=True
323+
).mean()
318324

319325
def _maximize_likelihood(self, X: NDArray, mask_na: NDArray, dt: float = np.nan) -> NDArray:
320326
"""

qolmat/imputations/imputers.py

Lines changed: 56 additions & 54 deletions
Original file line number | Diff line number | Diff line change
@@ -1223,10 +1223,8 @@ def fit(self, X: pd.DataFrame, y=None):
12231223
Returns self.
12241224
"""
12251225
super().fit(X)
1226-
df = self._check_input(X)
12271226
hyperparameters = self.get_hyperparams()
12281227
self.imputer_ = KNNImputer(metric="nan_euclidean", **hyperparameters)
1229-
self.imputer_.fit(df)
12301228
return self
12311229

12321230
def _transform_element(self, df: pd.DataFrame, col: str = "__all__") -> pd.DataFrame:
@@ -1252,7 +1250,7 @@ def _transform_element(self, df: pd.DataFrame, col: str = "__all__") -> pd.DataF
12521250
Input has to be a pandas.DataFrame.
12531251
"""
12541252
self._check_dataframe(df)
1255-
results = self.imputer_.transform(df)
1253+
results = self.imputer_.fit_transform(df)
12561254
return pd.DataFrame(data=results, columns=df.columns, index=df.index)
12571255

12581256

@@ -1329,10 +1327,10 @@ def fit(self, X: pd.DataFrame, y=None):
13291327
"""
13301328
hyperparams = self.get_hyperparams()
13311329
super().fit(X)
1332-
df = self._check_input(X)
13331330
self.imputer_ = IterativeImputer(estimator=self.estimator, **hyperparams)
1334-
self.imputer_.fit(df)
1335-
self.n_iter_ = self.imputer_.n_iter_
1331+
self.n_iter_ = 1
1332+
# requires fitting IterativeImputer in the fit method
1333+
# self.n_iter_ = self.imputer_.n_iter_
13361334
return self
13371335

13381336
def _transform_element(self, df: pd.DataFrame, col: str = "__all__") -> pd.DataFrame:
@@ -1358,12 +1356,10 @@ def _transform_element(self, df: pd.DataFrame, col: str = "__all__") -> pd.DataF
13581356
Input has to be a pandas.DataFrame.
13591357
"""
13601358
self._check_dataframe(df)
1361-
res = self.imputer_.transform(df)
1362-
imputed = pd.DataFrame(columns=df.columns)
1363-
for ind, col in enumerate(imputed.columns):
1364-
imputed[col] = res[:, ind]
1365-
imputed.index = df.index
1366-
return imputed
1359+
X_imputed = self.imputer_.fit_transform(df)
1360+
df_imputed = pd.DataFrame(X_imputed, index=df.index, columns=df.columns)
1361+
1362+
return df_imputed
13671363

13681364

13691365
class ImputerRegressor(_Imputer):
@@ -1727,47 +1723,47 @@ def get_model(self, **hyperparams) -> em_sampler.EM:
17271723
" Valid values are `multinormal`and `VAR`."
17281724
)
17291725

1730-
def fit(self, X: pd.DataFrame, y=None):
1731-
"""Fit the imputer on X.
1732-
1733-
Parameters
1734-
----------
1735-
X : pd.DataFrame
1736-
Data matrix on which the Imputer must be fitted.
1737-
1738-
Returns
1739-
-------
1740-
self : Self
1741-
Returns self.
1742-
"""
1743-
super().fit(X)
1744-
df = self._check_input(X)
1745-
1746-
# n_rows, n_cols = df.shape
1747-
# if n_rows == 1:
1748-
# raise ValueError("n_samples=1 is not allowed!")
1749-
1750-
if self.model not in ["multinormal", "VAR1"]:
1751-
raise ValueError(
1752-
f"Model argument `{self.model}` is invalid!"
1753-
" Valid values are `multinormal`and `VAR`."
1754-
)
1755-
1756-
cols_with_nans = df.columns[df.isna().any()]
1757-
1758-
self._models = {}
1759-
if self.columnwise:
1760-
for col in cols_with_nans:
1761-
hyperparams = self.get_hyperparams(col=col)
1762-
model = self.get_model(**hyperparams)
1763-
model.fit(df[col].values)
1764-
self._models[col] = model
1765-
else:
1766-
hyperparams = self.get_hyperparams()
1767-
model = self.get_model(**hyperparams)
1768-
model.fit(df.values.T)
1769-
self._models["__all__"] = model
1770-
return self
1726+
# def fit(self, X: pd.DataFrame, y=None):
1727+
# """Fit the imputer on X.
1728+
1729+
# Parameters
1730+
# ----------
1731+
# X : pd.DataFrame
1732+
# Data matrix on which the Imputer must be fitted.
1733+
1734+
# Returns
1735+
# -------
1736+
# self : Self
1737+
# Returns self.
1738+
# """
1739+
# super().fit(X)
1740+
# df = self._check_input(X)
1741+
1742+
# # n_rows, n_cols = df.shape
1743+
# # if n_rows == 1:
1744+
# # raise ValueError("n_samples=1 is not allowed!")
1745+
1746+
# if self.model not in ["multinormal", "VAR1"]:
1747+
# raise ValueError(
1748+
# f"Model argument `{self.model}` is invalid!"
1749+
# " Valid values are `multinormal`and `VAR`."
1750+
# )
1751+
1752+
# cols_with_nans = df.columns[df.isna().any()]
1753+
1754+
# self._models = {}
1755+
# if self.columnwise:
1756+
# for col in cols_with_nans:
1757+
# hyperparams = self.get_hyperparams(col=col)
1758+
# model = self.get_model(**hyperparams)
1759+
# model.fit(df[col].values)
1760+
# self._models[col] = model
1761+
# else:
1762+
# hyperparams = self.get_hyperparams()
1763+
# model = self.get_model(**hyperparams)
1764+
# model.fit(df.values.T)
1765+
# self._models["__all__"] = model
1766+
# return self
17711767

17721768
def _transform_element(self, df: pd.DataFrame, col: str = "__all__") -> pd.DataFrame:
17731769
"""
@@ -1792,7 +1788,13 @@ def _transform_element(self, df: pd.DataFrame, col: str = "__all__") -> pd.DataF
17921788
Input has to be a pandas.DataFrame.
17931789
"""
17941790
self._check_dataframe(df)
1795-
model = self._models[col]
1791+
1792+
hyperparams = self.get_hyperparams(col=col)
1793+
model = self.get_model(**hyperparams)
1794+
if col == "__all__":
1795+
model.fit(df.values.T)
1796+
else:
1797+
model.fit(df[col].values)
17961798

17971799
X = df.values.T.astype(float)
17981800
X_imputed = model.transform(X)

tests/imputations/test_imputers.py

Lines changed: 33 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -3,9 +3,11 @@
33
import numpy as np
44
import pandas as pd
55
import pytest
6-
from sklearn.ensemble import ExtraTreesRegressor
76
from sklearn.utils.estimator_checks import check_estimator, parametrize_with_checks
87
from qolmat.benchmark.hyperparameters import HyperValue
8+
from sklearn.linear_model import LinearRegression
9+
from sklearn.ensemble import ExtraTreesRegressor
10+
911

1012
from qolmat.imputations import imputers
1113

@@ -286,6 +288,35 @@ def test_ImputerEM_fit_transform(df: pd.DataFrame) -> None:
286288
np.testing.assert_allclose(result, expected, atol=1e-2)
287289

288290

291+
index_grouped = pd.MultiIndex.from_product([["a", "b"], range(4)], names=["group", "date"])
292+
dict_values = {"col1": [0, np.nan, 0, 0, 1, 1, 1, 1], "col2": np.arange(8)}
293+
df_grouped = pd.DataFrame(dict_values, index=index_grouped)
294+
295+
list_imputers = [
296+
imputers.ImputerMean(groups=("group",)),
297+
imputers.ImputerMedian(groups=("group",)),
298+
imputers.ImputerMode(groups=("group",)),
299+
imputers.ImputerShuffle(groups=("group",)),
300+
imputers.ImputerLOCF(groups=("group",)),
301+
imputers.ImputerNOCB(groups=("group",)),
302+
imputers.ImputerInterpolation(groups=("group",)),
303+
imputers.ImputerResiduals(groups=("group",), period=2),
304+
imputers.ImputerKNN(groups=("group",)),
305+
imputers.ImputerMICE(groups=("group",)),
306+
imputers.ImputerRegressor(groups=("group",), estimator=LinearRegression()),
307+
imputers.ImputerRPCA(groups=("group",)),
308+
imputers.ImputerEM(groups=("group",)),
309+
]
310+
311+
312+
@pytest.mark.parametrize("imputer", list_imputers)
313+
def test_models_fit_transform_grouped(imputer):
314+
# imputer = imputers.ImputerEM(groups=("group",), method="sample", random_state=42)
315+
result = imputer.fit_transform(df_grouped)
316+
expected = df_grouped.fillna(0)
317+
np.testing.assert_allclose(result, expected)
318+
319+
289320
@parametrize_with_checks(
290321
[
291322
imputers._Imputer(),
@@ -298,7 +329,7 @@ def test_ImputerEM_fit_transform(df: pd.DataFrame) -> None:
298329
imputers.ImputerNOCB(),
299330
imputers.ImputerInterpolation(),
300331
imputers.ImputerResiduals(period=2),
301-
imputers.KNNImputer(),
332+
imputers.ImputerKNN(),
302333
imputers.ImputerMICE(),
303334
imputers.ImputerRegressor(),
304335
imputers.ImputerRPCA(tau=0, lam=0),

0 commit comments

Comments
 (0)