
Commit 65ff5fb

Julien Roussel authored and committed
RPCA uniformized, online not functional
1 parent c470767 commit 65ff5fb

File tree

14 files changed: +605 -515 lines changed


examples/1_timeSeries.ipynb

Lines changed: 132 additions & 127 deletions
Large diffs are not rendered by default.

qolmat/benchmark/comparator.py

Lines changed: 47 additions & 36 deletions
@@ -1,11 +1,17 @@
+import logging
 from typing import Dict, List, Optional, Union
 
 import numpy as np
 import pandas as pd
 
+from qolmat import logging as qlog
 from qolmat.benchmark import cross_validation, utils
 from qolmat.benchmark.missing_patterns import _HoleGenerator
 
+qlog.log_setup()
+logger = logging.getLogger(__name__)
+# logger.setLevel(logging.DEBUG)
+
 
 class Comparator:
     """
@@ -22,8 +28,8 @@ class Comparator:
     search_params: Optional[Dict[str, Dict[str, Union[str, float, int]]]] = {}
         dictionary of search space for each implementation method. By default, the value is set to
         {}.
-    n_cv_calls: Optional[int] = 10
-        number of calls of the hyperparameters cross-validation. By default, the value is set to
+    n_calls_opt: Optional[int] = 10
+        number of calls of the optimization algorithm
         10.
     """
 
@@ -33,18 +39,18 @@ def __init__(
         selected_columns: List[str],
         generator_holes: _HoleGenerator,
         search_params: Optional[Dict[str, Dict[str, Union[float, int, str]]]] = {},
-        n_cv_calls: Optional[int] = 10,
+        n_calls_opt: Optional[int] = 10,
     ):
 
         self.dict_models = dict_models
         self.selected_columns = selected_columns
         self.generator_holes = generator_holes
         self.search_params = search_params
-        self.n_cv_calls = n_cv_calls
+        self.n_calls_opt = n_calls_opt
 
     def get_errors(
         self, df_origin: pd.DataFrame, df_imputed: pd.DataFrame, df_mask: pd.DataFrame
-    ) -> float:
+    ) -> pd.DataFrame:
         """Functions evaluating the reconstruction's quality
 
@@ -73,6 +79,7 @@ def get_errors(
             df_origin[df_mask],
             df_imputed[df_mask],
         )
+
         dict_errors["kl"] = utils.kl_divergence(
             df_origin[df_mask],
             df_imputed[df_mask],
@@ -82,8 +89,8 @@ def get_errors(
         return errors
 
     def evaluate_errors_sample(
-        self, tested_model: any, df: pd.DataFrame, search_space: Optional[dict] = None
-    ) -> Dict:
+        self, imputer: any, df: pd.DataFrame, list_spaces: List[Dict] = {}
+    ) -> pd.Series:
         """Evaluate the errors in the cross-validation
 
@@ -92,8 +99,8 @@ def evaluate_errors_sample(
             imputation model
         df : pd.DataFrame
             dataframe to impute
-        search_space : Optional[dict], optional
-            search space for tested_model's hyperparameters, by default None
+        search_space : Dict
+            search space for tested_model's hyperparameters
 
         Returns
         -------
@@ -102,19 +109,24 @@ def evaluate_errors_sample(
         """
        list_errors = []
         df_origin = df[self.selected_columns].copy()
+        if list_spaces:
+            print("Hyperparameter optimization")
+            print(list_spaces)
+        else:
+            print("No hyperparameter optimization")
         for df_mask in self.generator_holes.split(df_origin):
             df_corrupted = df_origin.copy()
             df_corrupted[df_mask] = np.nan
-            if search_space is None:
-                df_imputed = tested_model.fit_transform(X=df_corrupted)
-            else:
+            if list_spaces:
                 cv = cross_validation.CrossValidation(
-                    tested_model,
-                    search_space=search_space,
+                    imputer,
+                    list_spaces=list_spaces,
                     hole_generator=self.generator_holes,
-                    n_calls=self.n_cv_calls,
+                    n_calls=self.n_calls_opt,
                 )
-                df_imputed = cv.fit_transform(X=df_corrupted)
+                df_imputed = cv.fit_transform(df_corrupted)
+            else:
+                df_imputed = imputer.fit_transform(df_corrupted)
 
             subset = self.generator_holes.subset
             errors = self.get_errors(df_origin[subset], df_imputed[subset], df_mask[subset])
@@ -140,30 +152,29 @@ def compare(self, df: pd.DataFrame, verbose: bool = True):
 
         dict_errors = {}
 
-        for name, tested_model in self.dict_models.items():
-            if verbose:
-                print("Tested model:", type(tested_model).__name__)
+        for name, imputer in self.dict_models.items():
+            logger.setLevel(logging.DEBUG)
+            print(f"Tested model: {type(imputer).__name__}")
+
+            search_params = self.search_params.get(name, {})
 
-            if str(type(tested_model).__name__) in self.search_params.keys():
-                if hasattr(tested_model, "columnwise") and tested_model.columnwise:
-                    if len(self.selected_columns) > 0:
-                        search_params = {}
-                        for col in self.selected_columns:
-                            for key, value in self.search_params[type(tested_model).__name__].items():
-                                search_params[f"('{col}', '{key}')"] = value
-                    else:
-                        search_params = self.search_params[type(tested_model).__name__]
-                else:
-                    search_params = self.search_params[type(tested_model).__name__]
-
-                search_space = utils.get_search_space(tested_model, search_params)
+            # if imputer.columnwise:
+            #     if len(self.selected_columns) > 0:
+            #         search_params = {}
+            #         for col in self.selected_columns:
+            #             for key, value in self.search_params[type(imputer).__name__].items():
+            #                 search_params[f"('{col}', '{key}')"] = value
+            #     else:
+            #         search_params = self.search_params[type(imputer).__name__]
+            # else:
+            #     search_params = self.search_params[type(imputer).__name__]
+
+            list_spaces = utils.get_search_space(search_params)
 
-            else:
-                search_space = None
             try:
-                dict_errors[name] = self.evaluate_errors_sample(tested_model, df, search_space)
+                dict_errors[name] = self.evaluate_errors_sample(imputer, df, list_spaces)
             except Exception as excp:
-                print("Error while testing ", type(tested_model).__name__)
+                print("Error while testing ", type(imputer).__name__)
                 raise excp
 
         df_errors = pd.DataFrame(dict_errors)
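
Note on the change in compare above: search spaces are now looked up by the entry name in dict_models (self.search_params.get(name, {})) rather than by the imputer class name, so two entries using the same imputer class can receive different search spaces. A minimal sketch of that lookup; the entry names, the placeholder imputer objects and the value format of the search spaces are illustrative, not taken from this commit:

dict_models = {"rpca_fast": "imputer_a", "rpca_slow": "imputer_b"}  # two configs of one class
search_params = {"rpca_slow": {"lam": {"min": 0.1, "max": 1.0, "type": "Real"}}}  # assumed format

for name, imputer in dict_models.items():
    # New behaviour: lookup by dictionary key; a missing key disables optimization for that entry.
    params = search_params.get(name, {})
    print(name, params)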

qolmat/benchmark/cross_validation.py

Lines changed: 49 additions & 27 deletions
@@ -1,11 +1,16 @@
+import logging
 from typing import Dict, List, Optional, Union
 
 import numpy as np
 import pandas as pd
 import skopt
+from skopt.space import Dimension
 
 from qolmat.benchmark.missing_patterns import _HoleGenerator
 
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+
 
 class CrossValidation:
     """
@@ -37,16 +42,16 @@ class CrossValidation:
 
     def __init__(
         self,
-        model: any,
-        search_space: List[skopt.Space],
+        imputer: any,
+        list_spaces: List[Dimension],
         hole_generator: _HoleGenerator,
         n_calls: int = 10,
         n_jobs: int = -1,
         loss_norm: int = 1,
         verbose: bool = True,
     ):
-        self.model = model
-        self.search_space = search_space
+        self.imputer = imputer
+        self.list_spaces = list_spaces
         self.hole_generator = hole_generator
         self.n_calls = n_calls
         self.n_jobs = n_jobs
@@ -85,17 +90,26 @@ def loss_function(
         else:
             raise ValueError("loss_norm has to be 0 or 1 (int)")
 
-    def _set_params(self, all_params: Dict[str, Union[float, int, str]]):
+    def deflat_hyperparams(self, hyperparams_flat: Dict[str, Union[float, int, str]]) -> Dict:
         """
         Set the hyperparameters to the model
 
         Parameters
         ----------
-        all_params : Dict[str, Union[int, float, str]]
+        hyperparams_flat : Dict[str, Union[int, float, str]]
             dictionary containing the hyperparameters and their value
         """
-        self.model.set_params(**all_params)
-        return self
+        hyperparams = {}
+        for name_dimension, hyperparam in hyperparams_flat.items():
+            if "/" not in name_dimension:
+                hyperparams[name_dimension] = hyperparam
+            else:
+                name_hyperparam, col = name_dimension.split("/")
+                if name_hyperparam in hyperparams:
+                    hyperparams[name_hyperparam][col] = hyperparam
+                else:
+                    hyperparams[name_hyperparam] = {col: hyperparam}
+        return hyperparams
 
     def objective(self, X):
         """
@@ -106,11 +120,9 @@ def objective(self, X):
         _type_
             objective function
         """
-        @skopt.utils.use_named_args(self.search_space)
-        def obj_func(**all_params):
-            self._set_params(all_params=all_params)
-            if self.verbose:
-                print(all_params)
+        @skopt.utils.use_named_args(self.list_spaces)
+        def obj_func(**hyperparams_flat):
+            self.imputer.hyperparams_optim = self.deflat_hyperparams(hyperparams_flat)
 
             errors = []
 
@@ -119,7 +131,7 @@ def obj_func(**all_params):
                 df_corrupted = df_origin.copy()
                 df_corrupted[df_mask] = np.nan
                 cols_with_nans = X.columns[X.isna().any(axis=0)].tolist()
-                imputed = self.model.fit_transform(df_corrupted)
+                imputed = self.imputer.fit_transform(df_corrupted)
 
                 error = self.loss_function(
                     df_origin.loc[:, cols_with_nans],
@@ -129,14 +141,12 @@ def obj_func(**all_params):
                 errors.append(error)
 
             mean_errors = np.mean(errors)
-            if self.verbose:
-                print(mean_errors)
             return mean_errors
 
         return obj_func
 
     def fit_transform(
-        self, X: pd.DataFrame, return_hyper_params: Optional[bool] = False
+        self, df: pd.DataFrame, return_hyper_params: Optional[bool] = False
     ) -> pd.DataFrame:
         """
         Fit and transform estimator and impute the missing values.
@@ -154,24 +164,36 @@
             imputed dataframe
         """
 
-        n0 = self.n_calls//5 if (self.n_calls//5) >= 1 else self.n_calls
+        n0 = max(5, self.n_calls // 5)
+        print("---")
+        print(self.n_calls)
+        print(n0)
+
+        # res = skopt.gp_minimize(
+        #     self.objective(X=df),
+        #     dimensions=self.list_spaces,
+        #     n_calls=self.n_calls,
+        #     n_initial_points=n0,
+        #     random_state=42,
+        #     n_jobs=self.n_jobs,
+        # )
 
         res = skopt.gp_minimize(
-            self.objective(X=X),
-            dimensions=self.search_space,
+            self.objective(X=df),
+            dimensions=self.list_spaces,
             n_calls=self.n_calls,
-            n_initial_points= n0,
+            n_initial_points=n0,
             random_state=42,
             n_jobs=self.n_jobs,
        )
 
-        best_params = {
-            self.search_space[param].name: res["x"][param] for param in range(len(res["x"]))
-        }
+        hyperparams_flat = {space.name: val for space, val in zip(self.list_spaces, res["x"])}
+        print(f"Optimal hyperparameters : {hyperparams_flat}")
+        print(f"Results: {res}")
 
-        self._set_params(all_params=best_params)
-        df_imputed = self.model.fit_transform(X=X)
+        self.imputer.hyperparams_optim = self.deflat_hyperparams(hyperparams_flat)
+        df_imputed = self.imputer.fit_transform(df)
 
         if return_hyper_params:
-            return df_imputed, best_params
+            return df_imputed, hyperparams_flat
         return df_imputed
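
For reference, the optimization in fit_transform relies only on standard skopt machinery: dimension names (possibly containing "/") are forwarded by skopt.utils.use_named_args as keyword arguments, and the best values are read back from res["x"]. A self-contained sketch of that round trip, independent of qolmat, using an illustrative quadratic objective and made-up dimension names:

import skopt
from skopt.space import Real

# Dimension names with "/" encode per-column hyperparameters; **kwargs accepts such keys.
list_spaces = [Real(0.0, 1.0, name="lam/a"), Real(0.0, 1.0, name="lam/b")]


@skopt.utils.use_named_args(list_spaces)
def objective(**hyperparams_flat):
    # hyperparams_flat looks like {"lam/a": 0.3, "lam/b": 0.7}
    return (hyperparams_flat["lam/a"] - 0.5) ** 2 + (hyperparams_flat["lam/b"] - 0.5) ** 2


res = skopt.gp_minimize(
    objective,
    dimensions=list_spaces,
    n_calls=12,
    n_initial_points=5,
    random_state=42,
)
print({space.name: val for space, val in zip(list_spaces, res["x"])})  # expected to be near 0.5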

qolmat/benchmark/missing_patterns.py

Lines changed: 4 additions & 3 deletions
@@ -89,7 +89,7 @@ def fit(self, X: pd.DataFrame) -> _HoleGenerator:
         if self.groups == []:
             self.ngroups = None
         else:
-            self.ngroups = X.groupby(self.groups).ngroup()
+            self.ngroups = X.groupby(self.groups).ngroup().rename("_ngroup")
 
         return self
 
@@ -395,10 +395,11 @@ def __init__(
             groups=groups,
         )
 
-    def compute_distribution_holes(self, states):
+    def compute_distribution_holes(self, states: pd.Series) -> pd.Series:
         series_id = (states.diff() != 0).cumsum()
         series_id = series_id[states]
         distribution_holes = series_id.value_counts().value_counts()
+        distribution_holes.index.name = "_size_hole"
         # distribution_holes /= distribution_holes.sum()
         return distribution_holes
 
@@ -428,7 +429,7 @@ def fit(self, X: pd.DataFrame) -> EmpiricalHoleGenerator:
             distributions_holes = states.groupby(self.ngroups).apply(
                 self.compute_distribution_holes
             )
-            distributions_holes = distributions_holes.groupby(level=0).sum()
+            distributions_holes = distributions_holes.groupby(by="_size_hole").sum()
             self.dict_distributions_holes[column] = distributions_holes
 
     def sample_sizes(self, column, n_masked):
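
The renamed index matters because EmpiricalHoleGenerator.fit then aggregates the per-group distributions with groupby(by="_size_hole").sum(). A small sketch reproducing the compute_distribution_holes lines above on a toy missing-value mask:

import pandas as pd

# Toy boolean mask for one column (True = missing value).
states = pd.Series([True, True, False, True, False, False, True, True, True])

series_id = (states.diff() != 0).cumsum()  # label each contiguous run of identical values
series_id = series_id[states]              # keep only the runs that are holes
distribution_holes = series_id.value_counts().value_counts()
distribution_holes.index.name = "_size_hole"
print(distribution_holes)  # one hole each of sizes 3, 2 and 1, indexed by "_size_hole"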

0 commit comments
