Skip to content

Commit 6aca955

Browse files
author
Gsaes
committed
Modification du code
1 parent 2931ae3 commit 6aca955

File tree

9 files changed

+210
-204
lines changed

9 files changed

+210
-204
lines changed

qolmat/benchmark/comparator.py

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
1-
import logging
21
from functools import partial
3-
from typing import Any, Dict, List, Optional, Union
2+
from typing import Any, Dict, List, Optional
43

54
import numpy as np
65
import pandas as pd
@@ -21,7 +20,7 @@ class Comparator:
2120
list of column's names selected (all with at least one null value will be imputed)
2221
columnwise_evaluation : Optional[bool], optional
2322
whether the metric should be calculated column-wise or not, by default False
24-
search_params: Optional[Dict[str, Dict[str, Union[str, float, int]]]] = {}
23+
dict_config_opti: Optional[Dict[str, Dict[str, Union[str, float, int]]]] = {}
2524
dictionary of search space for each implementation method. By default, the value is set to
2625
{}.
2726
n_calls_opt: int = 10
@@ -50,14 +49,14 @@ def __init__(
5049
selected_columns: List[str],
5150
generator_holes: _HoleGenerator,
5251
metrics: List = ["mae", "wmape", "KL_columnwise"],
53-
search_params: Optional[Dict] = {},
52+
dict_config_opti: Optional[Dict[str, Any]] = {},
5453
n_calls_opt: int = 10,
5554
):
5655
self.dict_imputers = dict_models
5756
self.selected_columns = selected_columns
5857
self.generator_holes = generator_holes
5958
self.metrics = metrics
60-
self.search_params = search_params
59+
self.dict_config_opti = dict_config_opti
6160
self.n_calls_opt = n_calls_opt
6261

6362
def get_errors(
@@ -92,7 +91,7 @@ def evaluate_errors_sample(
9291
self,
9392
imputer: Any,
9493
df: pd.DataFrame,
95-
list_spaces: List[Dict] = [],
94+
dict_config_opti_imputer: Dict[str, Any] = {},
9695
) -> pd.Series:
9796
"""Evaluate the errors in the cross-validation
9897
@@ -115,10 +114,10 @@ def evaluate_errors_sample(
115114
for df_mask in self.generator_holes.split(df_origin):
116115
df_corrupted = df_origin.copy()
117116
df_corrupted[df_mask] = np.nan
118-
if list_spaces:
117+
if dict_config_opti_imputer:
119118
cv = cross_validation.CrossValidation(
120119
imputer,
121-
list_spaces=list_spaces,
120+
dict_config_opti_imputer=dict_config_opti_imputer,
122121
hole_generator=self.generator_holes,
123122
n_calls=self.n_calls_opt,
124123
)
@@ -153,11 +152,12 @@ def compare(
153152
dict_errors = {}
154153

155154
for name, imputer in self.dict_imputers.items():
156-
search_params = self.search_params.get(name, {})
157-
list_spaces = utils.get_search_space(search_params)
155+
dict_config_opti_imputer = self.dict_config_opti.get(name, {})
158156

159157
try:
160-
dict_errors[name] = self.evaluate_errors_sample(imputer, df, list_spaces)
158+
dict_errors[name] = self.evaluate_errors_sample(
159+
imputer, df, dict_config_opti_imputer
160+
)
161161
print(f"Tested model: {type(imputer).__name__}")
162162
except Exception as excp:
163163
print("Error while testing ", type(imputer).__name__)
Lines changed: 129 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,113 @@
11
import logging
2-
from typing import Any, Dict, List, Optional, Union
3-
2+
from typing import Any, Callable, Dict, List, Union
3+
from skopt.space import Categorical, Dimension, Integer, Real
44
import numpy as np
55
import pandas as pd
66
import skopt
7-
from skopt.space import Dimension
87

98
from qolmat.benchmark.missing_patterns import _HoleGenerator
109

1110
logger = logging.getLogger(__name__)
1211
logger.setLevel(logging.DEBUG)
1312

1413

14+
def get_dimension(dict_bounds: Dict, name_dimension: str) -> Dimension:
    """Build the skopt dimension describing one hyperparameter.

    Parameters
    ----------
    dict_bounds : Dict
        Description of the hyperparameter. It must contain a "type" key equal
        to "Integer", "Real" or "Categorical". "Integer" and "Real" also read
        the "min" and "max" keys; "Categorical" reads the "categories" key.
    name_dimension : str
        Name given to the returned dimension.

    Returns
    -------
    Dimension
        A skopt.space.Integer, skopt.space.Real or skopt.space.Categorical,
        depending on dict_bounds["type"].

    Raises
    ------
    ValueError
        If dict_bounds["type"] is not one of the supported types.
    """
    type_dimension = dict_bounds["type"]
    if type_dimension == "Integer":
        return Integer(low=dict_bounds["min"], high=dict_bounds["max"], name=name_dimension)
    if type_dimension == "Real":
        return Real(low=dict_bounds["min"], high=dict_bounds["max"], name=name_dimension)
    if type_dimension == "Categorical":
        return Categorical(categories=dict_bounds["categories"], name=name_dimension)
    # Fail loudly instead of implicitly returning None, which would otherwise
    # end up inside the search space handed to the optimizer.
    raise ValueError(
        f"Unsupported dimension type '{type_dimension}' for '{name_dimension}'; "
        "expected 'Integer', 'Real' or 'Categorical'."
    )
37+
38+
39+
def get_search_space(dict_config_opti_imputer: Dict) -> List[Dimension]:
    """Turn an imputer's optimisation config into a list of dimensions.

    Parameters
    ----------
    dict_config_opti_imputer : Dict
        Maps each hyperparameter name either to a bounds dictionary (one that
        contains a "type" key) or to a dictionary mapping column names to such
        bounds dictionaries.

    Returns
    -------
    List[Dimension]
        One dimension per hyperparameter, or one per (hyperparameter, column)
        pair, the latter being named "<hyperparameter>/<column>".
    """
    search_space = []

    for name_hyperparam, config in dict_config_opti_imputer.items():
        if "type" in config:
            # The entry is itself a bounds dictionary: one dimension shared by
            # all columns.
            search_space.append(get_dimension(config, name_hyperparam))
            continue
        # Otherwise the entry is keyed by column: one dimension per column.
        search_space.extend(
            get_dimension(dict_bounds, f"{name_hyperparam}/{col}")
            for col, dict_bounds in config.items()
        )

    return search_space
65+
66+
67+
def deflat_hyperparams(
    hyperparams_flat: Dict[str, Union[float, int, str]]
) -> Dict[str, Union[float, int, str, Dict[str, Union[float, int, str]]]]:
    """
    Rebuild nested hyperparameters from their flat representation.

    Keys without "/" are copied as they are. Keys of the form
    "<hyperparameter>/<column>" are grouped into one dictionary per
    hyperparameter, indexed by column.

    Parameters
    ----------
    hyperparams_flat : Dict[str, Union[int, float, str]]
        dictionary containing the hyperparameters and their value

    Returns
    -------
    Dict
        Nested version of hyperparams_flat
    """
    hyperparams: Dict[str, Any] = {}
    for name_dimension, hyperparam in hyperparams_flat.items():
        # Split on the first "/" only, so that a column name which itself
        # contains "/" is kept whole instead of breaking the unpacking.
        name_hyperparam, separator, col = name_dimension.partition("/")
        if not separator:
            hyperparams[name_dimension] = hyperparam
        else:
            hyperparams.setdefault(name_hyperparam, {})[col] = hyperparam
    return hyperparams
96+
97+
1598
class CrossValidation:
1699
"""
17100
This class implements a cross-validation to find the hyperparameters
18101
that minimize a reconstruction loss (L1 or L2) over multiple subsets
19102
20103
Parameters
21104
----------
22-
model:
23-
search_space: Optional[Dict[str, Union[int, float, str]]]
105+
imputer: Any
106+
Imputer with the hyperparameters
107+
dict_config_opti_imputer: Optional[Dict[str, Union[int, float, str]]]
24108
search space for the hyperparameters
25-
hole_generator:
26-
109+
hole_generator: _HoleGenerator
110+
The generator of holes
27111
n_calls: Optional[int]
28112
number of calls. By default the value is set to 10
29113
n_jobs: Optional[int]
@@ -32,23 +116,19 @@ class CrossValidation:
32116
-1 means using all processors. By default the value is set to -1
33117
loss_norm: Optional[int]
34118
loss norm to evaluate the reconstruction. By default the value is set to 1
35-
ratio_missing: Optional[float]
36-
ratio of artificially missing data. By default the value is set to 0.1
37-
corruption: Optional[str]
38-
type of corruption: "missing" or "outlier". By default the value is set to "missing"
39119
"""
40120

41121
def __init__(
42122
self,
43123
imputer: Any,
44-
list_spaces: List[Dimension],
124+
dict_config_opti_imputer: Dict[str, Any],
45125
hole_generator: _HoleGenerator,
46126
n_calls: int = 10,
47127
n_jobs: int = -1,
48128
loss_norm: int = 1,
49129
):
50130
self.imputer = imputer
51-
self.list_spaces = list_spaces
131+
self.dict_config_opti_imputer = dict_config_opti_imputer
52132
self.hole_generator = hole_generator
53133
self.n_calls = n_calls
54134
self.n_jobs = n_jobs
@@ -89,31 +169,7 @@ def loss_function(
89169
else:
90170
raise ValueError("loss_norm has to be 0 or 1 (int)")
91171

92-
def deflat_hyperparams(
93-
self, hyperparams_flat: Dict[str, Union[float, int, str]]
94-
) -> Dict[str, Union[float, int, str, Dict[str, Union[float, int, str]]]]:
95-
"""
96-
Set the hyperparameters to the model
97-
98-
Parameters
99-
----------
100-
hyperparams_flat : Dict[str, Union[int, float, str]]
101-
dictionary containing the hyperparameters and their value
102-
"""
103-
hyperparams: Dict[str, Any] = {}
104-
for name_dimension, hyperparam in hyperparams_flat.items():
105-
if "/" not in name_dimension:
106-
hyperparams[name_dimension] = hyperparam
107-
else:
108-
name_hyperparam, col = name_dimension.split("/")
109-
if name_hyperparam in hyperparams:
110-
hyperparams[name_hyperparam][col] = hyperparam
111-
else:
112-
new_dict: Dict[str, Union[float, int, str]] = {col: hyperparam}
113-
hyperparams[name_hyperparam] = new_dict
114-
return hyperparams
115-
116-
def objective(self, X):
172+
def objective(self, df: pd.DataFrame, list_spaces: List[Dimension]) -> Callable:
117173
"""
118174
Define the objective function for the cross-validation
119175
@@ -123,17 +179,17 @@ def objective(self, X):
123179
objective function
124180
"""
125181

126-
@skopt.utils.use_named_args(self.list_spaces)
182+
@skopt.utils.use_named_args(list_spaces)
127183
def obj_func(**hyperparams_flat):
128-
self.imputer.hyperparams_optim = self.deflat_hyperparams(hyperparams_flat)
184+
self.imputer.hyperparams_optim = deflat_hyperparams(hyperparams_flat)
129185

130186
errors = []
131187

132-
for df_mask in self.hole_generator.split(X):
133-
df_origin = X.copy()
188+
for df_mask in self.hole_generator.split(df):
189+
df_origin = df.copy()
134190
df_corrupted = df_origin.copy()
135191
df_corrupted[df_mask] = np.nan
136-
cols_with_nans = X.columns[X.isna().any(axis=0)].tolist()
192+
cols_with_nans = df.columns[df.isna().any(axis=0)].tolist()
137193
imputed = self.imputer.fit_transform(df_corrupted)
138194

139195
error = self.loss_function(
@@ -148,55 +204,49 @@ def obj_func(**hyperparams_flat):
148204

149205
return obj_func
150206

151-
def fit_transform(
152-
self, df: pd.DataFrame, return_hyper_params: Optional[bool] = False
153-
) -> pd.DataFrame:
154-
"""
155-
Fit and transform estimator and impute the missing values.
207+
def optimize_hyperparams(self, df: pd.DataFrame) -> Dict[str, Union[float, int, str]]:
208+
"""Optimize hyperparameters
156209
157210
Parameters
158211
----------
159-
X : pd.DataFrame
160-
dataframe to impute
161-
return_hyper_params : Optional[bool]
162-
by default False
212+
df : pd.DataFrame
213+
DataFrame masked
163214
164215
Returns
165216
-------
166-
pd.DataFrame
167-
imputed dataframe
217+
Dict[str, Union[float,int, str]]
218+
hyperparameters optimize flat
168219
"""
169-
170-
n0 = max(5, self.n_calls // 5)
171-
print("---")
172-
print(self.n_calls)
173-
print(n0)
174-
175-
# res = skopt.gp_minimize(
176-
# self.objective(X=df),
177-
# dimensions=self.list_spaces,
178-
# n_calls=self.n_calls,
179-
# n_initial_points=n0,
180-
# random_state=42,
181-
# n_jobs=self.n_jobs,
182-
# )
183-
220+
list_spaces = get_search_space(self.dict_config_opti_imputer)
184221
res = skopt.gp_minimize(
185-
self.objective(X=df),
186-
dimensions=self.list_spaces,
222+
self.objective(df, list_spaces),
223+
dimensions=list_spaces,
187224
n_calls=self.n_calls,
188-
n_initial_points=n0,
189-
random_state=42,
225+
n_initial_points=max(5, self.n_calls // 5),
226+
random_state=self.imputer.random_state,
190227
n_jobs=self.n_jobs,
191228
)
192229

193-
hyperparams_flat = {space.name: val for space, val in zip(self.list_spaces, res["x"])}
194-
print(f"Optimal hyperparameters : {hyperparams_flat}")
195-
print(f"Results: {res}")
230+
hyperparams_flat = {space.name: val for space, val in zip(list_spaces, res["x"])}
231+
return hyperparams_flat
232+
233+
def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
234+
"""
235+
Fit and transform estimator and impute the missing values.
236+
237+
Parameters
238+
----------
239+
df : pd.DataFrame
240+
dataframe to impute
241+
242+
Returns
243+
-------
244+
pd.DataFrame
245+
imputed dataframe
246+
"""
196247

197-
self.imputer.hyperparams_optim = self.deflat_hyperparams(hyperparams_flat)
248+
hyperparams_flat = self.optimize_hyperparams(df)
249+
self.imputer.hyperparams_optim = deflat_hyperparams(hyperparams_flat)
198250
df_imputed = self.imputer.fit_transform(df)
199251

200-
if return_hyper_params:
201-
return df_imputed, hyperparams_flat
202252
return df_imputed

0 commit comments

Comments
 (0)