Commit 3349241

Merge pull request #9 from Quantmetry/fix_rpca
Fix rpca
2 parents e22e3af + 7107cba commit 3349241

File tree: 6 files changed, +117 / -45 lines changed

examples/benchmark.md

Lines changed: 6 additions & 7 deletions

@@ -133,14 +133,12 @@ imputer_tsmle = imputers.ImputerEM(groups=["station"], method="VAR1", strategy="
 
 imputer_knn = imputers.ImputerKNN(groups=["station"], k=10)
 imputer_iterative = imputers.ImputerMICE(groups=["station"], estimator=LinearRegression(), sample_posterior=False, max_iter=100, missing_values=np.nan)
-impute_regressor = imputers.ImputerRegressor(LinearRegression, groups=["station"])
-impute_stochastic_regressor = imputers.ImputerStochasticRegressor(
-    HistGradientBoostingRegressor(), cols_to_impute=cols_to_impute
-)
+impute_regressor = imputers.ImputerRegressor(groups=["station"], estimator=LinearRegression())
+impute_stochastic_regressor = imputers.ImputerStochasticRegressor(groups=["station"], estimator=LinearRegression())
 
 dict_imputers = {
     "mean": imputer_mean,
-    # "median": imputer_median,
+    "median": imputer_median,
     # "mode": imputer_mode,
     "interpolation": imputer_interpol,
     # "spline": imputer_spline,
@@ -182,7 +180,7 @@ Concretely, the comparator takes as input a dataframe to impute, a proportion of
 Note these metrics compute reconstruction errors; it tells nothing about the distances between the "true" and "imputed" distributions.
 
 ```python tags=[]
-generator_holes = missing_patterns.EmpiricalHoleGenerator(n_splits=10, groups=["station"], ratio_masked=ratio_masked)
+generator_holes = missing_patterns.EmpiricalHoleGenerator(n_splits=2, groups=["station"], ratio_masked=ratio_masked)
 
 comparison = comparator.Comparator(
     dict_imputers,
@@ -245,6 +243,8 @@ for col in cols_to_impute:
 ```
 
 ```python
+# plot.plot_imputations(df_station, dfs_imputed_station)
+
 n_columns = len(df_plot.columns)
 n_imputers = len(dict_imputers)
 
@@ -269,7 +269,6 @@ for name_imputer in dict_imputers:
         ax.xaxis.set_major_locator(loc)
         ax.tick_params(axis='both', which='major', labelsize=17)
         i_plot += 1
-plt.xlim(0, 100)
 plt.savefig("figures/imputations_benchmark.png")
 plt.show()
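For readers updating their own notebooks, the API change above boils down to passing an estimator instance through the `estimator` keyword rather than an estimator class positionally. A minimal sketch of the new calls (imports as in the benchmark; `LinearRegression` stands in for any scikit-learn regressor):

```python
from sklearn.linear_model import LinearRegression
from qolmat.imputations import imputers

# Old style (removed in this commit):
#   imputers.ImputerRegressor(LinearRegression, groups=["station"])
# New style: pass an instantiated estimator via the `estimator` keyword.
impute_regressor = imputers.ImputerRegressor(
    groups=["station"], estimator=LinearRegression()
)
impute_stochastic_regressor = imputers.ImputerStochasticRegressor(
    groups=["station"], estimator=LinearRegression()
)
```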

Binary file changed (636 KB); preview not shown.

qolmat/__init__.py

Lines changed: 1 addition & 2 deletions

@@ -1,5 +1,4 @@
+from . import benchmark, imputations, utils
 from ._version import __version__
 
-from . import utils
-
 __all__ = ["utils", "__version__"]
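With the consolidated import line, the subpackages are reachable straight from the top-level package; a quick sanity check, assuming qolmat is installed:

```python
from qolmat import benchmark, imputations, utils
from qolmat import __version__

print(__version__)
```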

qolmat/imputations/imputers.py

Lines changed: 53 additions & 24 deletions

@@ -1,7 +1,10 @@
+import abc
+import copy
 import sys
 from typing import Any, Dict, List, Optional, Union
 
 import sklearn.neighbors._base
+from sklearn.base import BaseEstimator
 
 sys.modules["sklearn.neighbors.base"] = sklearn.neighbors._base
 
@@ -20,12 +23,19 @@
 
 
 class Imputer(_BaseImputer):
-    def __init__(self, groups: List[str] = [], columnwise: bool = False, hyperparams: Dict = {}):
+    def __init__(
+        self,
+        groups: List[str] = [],
+        columnwise: bool = False,
+        shrink: bool = False,
+        hyperparams: Dict = {},
+    ):
         self.hyperparams_user = hyperparams
         self.hyperparams_optim = {}
         self.hyperparams_local = {}
         self.groups = groups
         self.columnwise = columnwise
+        self.shrink = shrink
 
     def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
         """
@@ -47,6 +57,12 @@ def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
         hyperparams = self.hyperparams_user.copy()
         hyperparams.update(self.hyperparams_optim)
         cols_with_nans = df.columns[df.isna().any()]
+
+        if self.groups == []:
+            self.ngroups = pd.Series(0, index=df.index).rename("_ngroup")
+        else:
+            self.ngroups = df.groupby(self.groups).ngroup().rename("_ngroup")
+
         if self.columnwise:
 
             # imputed = pd.DataFrame(index=df.index, columns=df.columns)
@@ -79,16 +95,20 @@ def impute_element(self, df: pd.DataFrame) -> pd.DataFrame:
         df = df.copy()
         if self.groups:
 
-            groupby = utils.custom_groupby(df, self.groups)
-            imputation_values = groupby.apply(self.fit_transform_element)
+            # groupby = utils.custom_groupby(df, self.groups)
+            groupby = df.groupby(self.ngroups, group_keys=False)
+            if self.shrink:
+                imputation_values = groupby.transform(self.fit_transform_element)
+            else:
+                imputation_values = groupby.apply(self.fit_transform_element)
         else:
             imputation_values = self.fit_transform_element(df)
 
         df = df.fillna(imputation_values)
-        # # fill na by applying imputation method without groups
-        # if df.isna().any().any():
-        #     imputation_values = self.fit_transform_fallback(df)
-        #     df = df.fillna(imputation_values)
+        # fill na by applying imputation method without groups
+        if df.isna().any().any():
+            imputation_values = self.fit_transform_fallback(df)
+            df = df.fillna(imputation_values)
 
         return df
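The new `shrink` flag selects how group-wise imputation is dispatched: with `shrink=True` the element method returns one statistic per group and `transform` broadcasts it back to the group's shape, while with `shrink=False` `apply` lets the method return a frame of the same shape; any values still missing afterwards are now filled by the fallback, which was previously commented out. A small pandas-only sketch of the two code paths, with toy data and illustrative names:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"station": ["a", "a", "b", "b"], "TEMP": [1.0, np.nan, 3.0, np.nan]})

# Equivalent of self.ngroups: one integer label per row, identifying its group.
ngroups = df.groupby("station").ngroup().rename("_ngroup")
values = df[["TEMP"]]
groupby = values.groupby(ngroups, group_keys=False)

# shrink=True path: the per-group statistic is broadcast back to the group's rows.
filled_shrink = values.fillna(groupby.transform("mean"))

# shrink=False path: the element method returns a frame of the group's shape.
filled_apply = values.fillna(groupby.apply(lambda group: group.fillna(group.mean())))

print(filled_shrink)
print(filled_apply)
```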

@@ -114,7 +134,7 @@ def __init__(
         self,
         groups: List[str] = [],
     ) -> None:
-        super().__init__(groups=groups, columnwise=True)
+        super().__init__(groups=groups, columnwise=True, shrink=True)
         self.fit_transform_element = pd.DataFrame.mean
 
 
@@ -139,7 +159,7 @@ def __init__(
         self,
         groups: List[str] = [],
     ) -> None:
-        super().__init__(groups=groups, columnwise=True)
+        super().__init__(groups=groups, columnwise=True, shrink=True)
         self.fit_transform_element = pd.DataFrame.median
 
 
@@ -164,7 +184,7 @@ def __init__(
         self,
         groups: List[str] = [],
     ) -> None:
-        super().__init__(groups=groups, columnwise=True)
+        super().__init__(groups=groups, columnwise=True, shrink=True)
         self.fit_transform_element = lambda df: df.mode().iloc[0]
 
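These three statistics imputers now take the `transform` branch, so each group contributes a single value per column. Usage is unchanged; a sketch with the class and group names used in the benchmark notebook (`df_with_nans` stands for any DataFrame with missing values and a `station` index level):

```python
from qolmat.imputations import imputers

imputer_mean = imputers.ImputerMean(groups=["station"])
imputer_median = imputers.ImputerMedian(groups=["station"])
imputer_mode = imputers.ImputerMode(groups=["station"])

# Each group's mean/median/mode is broadcast onto that group's missing cells.
df_imputed = imputer_median.fit_transform(df_with_nans)
```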

@@ -509,9 +529,11 @@ class ImputerMICE(Imputer):
     def __init__(
         self,
         groups: List[str] = [],
+        estimator: Optional[BaseEstimator] = None,
         **hyperparams,
     ) -> None:
         super().__init__(groups=groups, columnwise=False, hyperparams=hyperparams)
+        self.estimator = estimator
 
     def fit_transform_element(self, df: pd.DataFrame) -> pd.DataFrame:
         """
@@ -530,7 +552,7 @@ def fit_transform_element(self, df: pd.DataFrame) -> pd.DataFrame:
         if not isinstance(df, pd.DataFrame):
             raise ValueError("Input has to be a pandas.DataFrame.")
 
-        iterative_imputer = IterativeImputer(**self.hyperparams_element)
+        iterative_imputer = IterativeImputer(estimator=self.estimator, **self.hyperparams_element)
         res = iterative_imputer.fit_transform(df.values)
         imputed = pd.DataFrame(columns=df.columns)
         for ind, col in enumerate(imputed.columns):
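The MICE wrapper now forwards an explicit `estimator` to scikit-learn's `IterativeImputer` instead of passing it through the hyperparameter dict; the benchmark constructs it like this (sketch mirroring the notebook line shown above):

```python
import numpy as np
from sklearn.linear_model import LinearRegression
from qolmat.imputations import imputers

imputer_mice = imputers.ImputerMICE(
    groups=["station"],
    estimator=LinearRegression(),   # forwarded to IterativeImputer(estimator=...)
    sample_posterior=False,
    max_iter=100,
    missing_values=np.nan,
)
```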
@@ -564,11 +586,15 @@ class ImputerRegressor(Imputer):
     """
 
     def __init__(
-        self, type_model: Any, groups: List[str] = [], fit_on_nan: bool = False, **hyperparams
+        self,
+        groups: List[str] = [],
+        estimator: Optional[BaseEstimator] = None,
+        fit_on_nan: bool = False,
+        **hyperparams,
     ):
         super().__init__(groups=groups, hyperparams=hyperparams)
         self.columnwise = False
-        self.type_model = type_model
+        self.estimator = estimator
         self.fit_on_nan = fit_on_nan
 
     def fit_transform_element(self, df: pd.DataFrame) -> pd.DataFrame:
@@ -598,7 +624,9 @@ def fit_transform_element(self, df: pd.DataFrame) -> pd.DataFrame:
                     value = value[col]
                 hyperparams[hyperparam] = value
 
-            model = self.type_model(**hyperparams)
+            # model = copy.deepcopy(self.estimator)
+            # for hyperparam, value in hyperparams.items():
+            #     setattr(model, hyperparam, value)
 
             if self.fit_on_nan:
                 X = df.drop(columns=col)
@@ -609,8 +637,8 @@ def fit_transform_element(self, df: pd.DataFrame) -> pd.DataFrame:
             if X.empty:
                 y_imputed = pd.Series(y.mean(), index=y.index)
            else:
-                model.fit(X[~is_na], y[~is_na])
-                y_imputed = model.predict(X[is_na])
+                self.estimator.fit(X[~is_na], y[~is_na])
+                y_imputed = self.estimator.predict(X[is_na])
             df_imputed.loc[is_na, col] = y_imputed
 
         return df_imputed
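The regressor imputer fits the supplied estimator on rows where the target column is observed and predicts the rows where it is missing, using the other columns as features. A self-contained sketch of the intended behaviour (toy data, not a test from the repository, default `fit_on_nan=False` assumed):

```python
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from qolmat.imputations import imputers

df = pd.DataFrame({"var1": [1.0, 2.0, np.nan, 4.0], "var2": [2.0, 4.0, 6.0, 8.0]})

imputer = imputers.ImputerRegressor(estimator=LinearRegression())
# var2 is fully observed, so it serves as the feature; the missing var1 value
# is predicted from the fitted linear relation (here var1 is roughly var2 / 2).
print(imputer.fit_transform(df))
```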
@@ -632,17 +660,19 @@ class ImputerStochasticRegressor(Imputer):
     >>> import pandas as pd
     >>> from qolmat.imputations.models import ImputeStochasticRegressor
     >>> from sklearn.ensemble import ExtraTreesRegressor
-    >>> imputor = ImputeStochasticRegressor(model=ExtraTreesRegressor())
+    >>> imputer = ImputeStochasticRegressor(estimator=ExtraTreesRegressor)
     >>> df = pd.DataFrame(data=[[1, 1, 1, 1],
     >>>                         [np.nan, np.nan, 2, 3],
     >>>                         [1, 2, 2, 5], [2, 2, 2, 2]],
     >>>                         columns=["var1", "var2", "var3", "var4"])
-    >>> imputor.fit_transform(df)
+    >>> imputer.fit_transform(df)
     """
 
-    def __init__(self, type_model: str, groups: List[str] = [], **hyperparams) -> None:
+    def __init__(
+        self, groups: List[str] = [], estimator: Optional[BaseEstimator] = None, **hyperparams
+    ) -> None:
         super().__init__(groups=groups, hyperparams=hyperparams)
-        self.type_model = type_model
+        self.estimator = estimator
 
     def fit_transform_element(self, df: pd.DataFrame) -> pd.Series:
         """
@@ -659,7 +689,6 @@ def fit_transform_element(self, df: pd.DataFrame) -> pd.Series:
            imputed dataframe
         """
         df_imp = df.copy()
-        model = self.type_model(**self.hyperparams)
         cols_with_nans = df.columns[df.isna().any()]
         cols_without_nans = df.columns[df.notna().all()]
 
@@ -670,8 +699,8 @@ def fit_transform_element(self, df: pd.DataFrame) -> pd.Series:
             X = df[cols_without_nans]
             y = df[col]
             is_na = y.isna()
-            model.fit(X[~is_na], y[~is_na])
-            y_pred = model.predict(X)
+            self.estimator.fit(X[~is_na], y[~is_na])
+            y_pred = self.estimator.predict(X)
             std_error = (y_pred[~is_na] - y[~is_na]).std()
             random_pred = np.random.normal(size=len(y), loc=y_pred, scale=std_error)
             df_imp.loc[is_na, col] = random_pred[is_na]
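The stochastic variant reuses the same fit/predict loop but perturbs the predictions with Gaussian noise whose scale is the residual standard deviation on the observed rows, so imputed values keep some of the spread of the data. A standalone sketch of that logic (illustrative helper, not the qolmat API itself):

```python
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression


def stochastic_impute_column(X: pd.DataFrame, y: pd.Series, estimator=None) -> pd.Series:
    """Fit on observed rows, predict everywhere, then add residual-scaled noise."""
    estimator = estimator if estimator is not None else LinearRegression()
    is_na = y.isna()
    estimator.fit(X[~is_na], y[~is_na])
    y_pred = estimator.predict(X)
    # Spread of the errors on the observed rows drives the noise scale.
    std_error = (y_pred[~is_na.values] - y[~is_na]).std()
    random_pred = np.random.normal(size=len(y), loc=y_pred, scale=std_error)
    y_out = y.copy()
    y_out[is_na] = random_pred[is_na.values]
    return y_out
```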
@@ -696,8 +725,8 @@ class ImputerRPCA(Imputer):
 
     def __init__(
         self,
-        method: str = "noisy",
         groups: List[str] = [],
+        method: str = "noisy",
         columnwise: bool = False,
         **hyperparams,
     ) -> None:
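Only the parameter order changes here, so that `groups` comes first as in the other imputers; calls that use keyword arguments are unaffected (construction sketch based on the signature above):

```python
from qolmat.imputations import imputers

imputer_rpca = imputers.ImputerRPCA(groups=["station"], method="noisy", columnwise=False)
```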

qolmat/utils/data.py

Lines changed: 27 additions & 11 deletions

@@ -82,7 +82,7 @@ def get_data(name_data="Beijing", datapath: str = "data/", download: Optional[bo
 
 
 def preprocess_data(df: pd.DataFrame):
-    """Put data into dataframe
+    """Preprocess data from the "Beijing" dataset
 
     Parameters
     ----------
@@ -106,14 +106,14 @@ def preprocess_data(df: pd.DataFrame):
     return df
 
 
-def add_holes(X: pd.DataFrame, ratio_masked: float, mean_size: int):
+def add_holes(df: pd.DataFrame, ratio_masked: float, mean_size: int):
     """
-    Creates holes in a dataset with no missing value. Only used in the documentation to design
+    Creates holes in a dataset with no missing value, starting from `df`. Only used in the documentation to design
     examples.
 
     Parameters
     ----------
-    X : pd.DataFrame
+    df : pd.DataFrame
         dataframe no missing values
 
     mean_size : int
@@ -130,18 +130,18 @@ def add_holes(X: pd.DataFrame, ratio_masked: float, mean_size: int):
     pd.DataFrame
         dataframe with missing values
     """
-    groups = X.index.names.difference(["datetime", "date", "index"])
+    groups = df.index.names.difference(["datetime", "date", "index"])
     generator = missing_patterns.GeometricHoleGenerator(
-        1, ratio_masked=ratio_masked, subset=X.columns, groups=groups
+        1, ratio_masked=ratio_masked, subset=df.columns, groups=groups
     )
 
-    generator.dict_probas_out = {column: 1 / mean_size for column in X.columns}
-    generator.dict_ratios = {column: 1 / len(X.columns) for column in X.columns}
+    generator.dict_probas_out = {column: 1 / mean_size for column in df.columns}
+    generator.dict_ratios = {column: 1 / len(df.columns) for column in df.columns}
     if generator.groups == []:
-        mask = generator.generate_mask(X)
+        mask = generator.generate_mask(df)
     else:
-        mask = X.groupby(groups, group_keys=False).apply(generator.generate_mask)
-    X_with_nans = X.copy()
+        mask = df.groupby(groups, group_keys=False).apply(generator.generate_mask)
+    X_with_nans = df.copy()
     X_with_nans[mask] = np.nan
     return X_with_nans
 
@@ -151,6 +151,22 @@ def get_data_corrupted(
     mean_size: int = 90,
     ratio_masked: float = 0.2,
 ):
+    """
+    Returns a dataframe with controlled corruption obtained from the source `name_data`
+
+    Parameters
+    ----------
+    name_data : str
+        Name of the data source, can be "Beijing" or "Artificial"
+    mean_size: int
+        Mean size of the holes to be generated using a geometric law
+    ratio_masked: float
+        Percent of missing data in each column in the output dataframe
+    Returns
+    -------
+    pd.DataFrame
+        Dataframe with missing values
+    """
     df = get_data(name_data)
     df = add_holes(df, mean_size=mean_size, ratio_masked=ratio_masked)
     return df
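Per the new docstring, the helper simply chains `get_data` and `add_holes`; typical use, with the defaults shown in the signature (sketch; downloading the Beijing data is a side effect of `get_data`):

```python
from qolmat.utils import data

df_corrupted = data.get_data_corrupted("Beijing", mean_size=90, ratio_masked=0.2)
# Roughly 20% of each column should now be masked, in holes of mean length 90.
print(df_corrupted.isna().mean())
```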

qolmat/utils/plot.py

Lines changed: 30 additions & 1 deletion

@@ -4,10 +4,11 @@
 
 from __future__ import annotations
 
-from typing import List, Optional, Tuple, Union
+from typing import Dict, List, Optional, Tuple, Union
 
 import matplotlib as mpl
 import matplotlib.pyplot as plt
+import matplotlib.ticker as plticker
 import numpy as np
 import pandas as pd
 import scipy
@@ -256,3 +257,31 @@ def multibar(df, ax=None, orientation="vertical", colors=None, decimals=0):
     # ax.bar_label(rects2, padding=3)
 
     # plt.tight_layout()
+
+
+def plot_imputations(df: pd.DataFrame, dict_df_imputed: Dict[str, pd.DataFrame]):
+    n_columns = len(df.columns)
+    n_imputers = len(dict_df_imputed)
+
+    fig = plt.figure(figsize=(8 * n_columns, 6 * n_imputers))
+    i_plot = 1
+    for name_imputer, df_imputed in dict_df_imputed.items():
+        for col in df:
+
+            ax = fig.add_subplot(n_imputers, n_columns, i_plot)
+            values_orig = df[col]
+
+            plt.plot(values_orig, ".", color="black", label="original")
+            # plt.plot(df.iloc[870:1000][col], markers[0], color='k', linestyle='-' , ms=3)
+
+            values_imp = df_imputed[col].copy()
+            values_imp[values_orig.notna()] = np.nan
+            plt.plot(values_imp, ".", color=tab10(0), label=name_imputer, alpha=1)
+            plt.ylabel(col, fontsize=16)
+            if i_plot % n_columns == 0:
+                plt.legend(loc=[1, 0], fontsize=18)
+            loc = plticker.MultipleLocator(base=2 * 365)
+            ax.xaxis.set_major_locator(loc)
+            ax.tick_params(axis="both", which="major", labelsize=17)
+            i_plot += 1
+    plt.show()
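The commented-out call in the benchmark above shows the intended use of the new helper: pass the original frame plus a dict mapping imputer names to imputed frames (sketch; `df_station` and `dict_imputers` come from the benchmark notebook):

```python
from qolmat.utils import plot

dfs_imputed_station = {
    name: imputer.fit_transform(df_station) for name, imputer in dict_imputers.items()
}
plot.plot_imputations(df_station, dfs_imputed_station)
```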
