11import logging
2- from typing import Any , Dict , List , Optional , Union
3-
2+ from typing import Any , Callable , Dict , List , Union
3+ from skopt . space import Categorical , Dimension , Integer , Real
44import numpy as np
55import pandas as pd
66import skopt
7- from skopt .space import Dimension
87
98from qolmat .benchmark .missing_patterns import _HoleGenerator
109
1110logger = logging .getLogger (__name__ )
1211logger .setLevel (logging .DEBUG )
1312
1413
def get_dimension(dict_bounds: Dict, name_dimension: str) -> Dimension:
    """Build the skopt dimension described by ``dict_bounds``.

    Parameters
    ----------
    dict_bounds : Dict
        Description of one hyperparameter. The ``"type"`` key selects the
        kind of dimension: ``"Integer"`` and ``"Real"`` read the ``"min"``
        and ``"max"`` keys, ``"Categorical"`` reads the ``"categories"`` key.
    name_dimension : str
        Name given to the returned dimension.

    Returns
    -------
    Dimension
        A ``skopt.space.Integer`` for ``"Integer"``,
        a ``skopt.space.Real`` for ``"Real"`` and
        a ``skopt.space.Categorical`` for ``"Categorical"``.

    Raises
    ------
    ValueError
        If ``dict_bounds["type"]`` is not one of the supported kinds.
    """
    kind = dict_bounds["type"]
    if kind == "Integer":
        return Integer(low=dict_bounds["min"], high=dict_bounds["max"], name=name_dimension)
    if kind == "Real":
        return Real(low=dict_bounds["min"], high=dict_bounds["max"], name=name_dimension)
    if kind == "Categorical":
        return Categorical(categories=dict_bounds["categories"], name=name_dimension)
    # Fail here with a clear message: falling through would return None, which
    # only surfaces later as an obscure error when the search space is used.
    raise ValueError(
        f"Unknown dimension type {kind!r} for hyperparameter {name_dimension!r}; "
        "expected 'Integer', 'Real' or 'Categorical'."
    )
37+
38+
def get_search_space(dict_config_opti_imputer: Dict) -> List[Dimension]:
    """Translate the optimisation config of an imputer into skopt dimensions.

    Parameters
    ----------
    dict_config_opti_imputer : Dict
        Maps each hyperparameter name either to a bounds dictionary (one that
        has a ``"type"`` key), shared by all columns, or to a dictionary
        mapping column names to such bounds dictionaries.

    Returns
    -------
    List[Dimension]
        One dimension per hyperparameter, or per hyperparameter/column pair.
        Per-column dimensions are named ``"<hyperparameter>/<column>"``.
    """
    dimensions = []

    for hyperparam_name, config in dict_config_opti_imputer.items():
        if "type" not in config:
            # No "type" key: the entry holds one bounds dictionary per column.
            dimensions.extend(
                get_dimension(bounds, f"{hyperparam_name}/{column}")
                for column, bounds in config.items()
            )
            continue
        # A "type" key marks a single space common to all columns.
        dimensions.append(get_dimension(config, hyperparam_name))

    return dimensions
65+
66+
def deflat_hyperparams(
    hyperparams_flat: Dict[str, Union[float, int, str]]
) -> Dict[str, Union[float, int, str, Dict[str, Union[float, int, str]]]]:
    """Rebuild the nested hyperparameter dictionary from its flat form.

    Keys without ``"/"`` are copied unchanged. Keys of the form
    ``"<hyperparameter>/<column>"`` are grouped into one dictionary per
    hyperparameter, indexed by column.

    Parameters
    ----------
    hyperparams_flat : Dict[str, Union[int, float, str]]
        Dictionary containing the flat hyperparameter names and their values.

    Returns
    -------
    Dict
        Nested hyperparameters, e.g. ``{"a": 1, "b/x": 2}`` becomes
        ``{"a": 1, "b": {"x": 2}}``.
    """
    hyperparams: Dict[str, Any] = {}
    for name_dimension, hyperparam in hyperparams_flat.items():
        if "/" not in name_dimension:
            hyperparams[name_dimension] = hyperparam
            continue
        # Split on the first "/" only, so that a column name which itself
        # contains "/" is kept whole instead of breaking the unpacking.
        name_hyperparam, col = name_dimension.split("/", 1)
        hyperparams.setdefault(name_hyperparam, {})[col] = hyperparam
    return hyperparams
96+
97+
1598class CrossValidation :
1699 """
17100 This class implements a cross-validation to find the hyperparameters
18101 that minimize a reconstruction loss (L1 or L2) over multiple subsets
19102
20103 Parameters
21104 ----------
22- model:
23- search_space: Optional[Dict[str, Union[int, float, str]]]
105+ imputer: Any
106+ Imputer with the hyperparameters
107+ dict_config_opti_imputer: Optional[Dict[str, Union[int, float, str]]]
24108 search space for the hyperparameters
25- hole_generator:
26-
109+ hole_generator: _HoleGenerator
110+ The generator of hole
27111 n_calls: Optional[int]
28112 number of calls. By default the value is set to 10
29113 n_jobs: Optional[int]
@@ -32,23 +116,19 @@ class CrossValidation:
32116 -1 means using all processors. By default the value is set to -1
33117 loss_norm: Optional[int]
34118 loss norm to evaluate the reconstruction. By default the value is set to 1
35- ratio_missing: Optional[float]
36- ratio of artificially missing data. By default the value is set to 0.1
37- corruption: Optional[str]
38- type of corruption: "missing" or "outlier". By default the value is set to "missing"
39119 """
40120
41121 def __init__ (
42122 self ,
43123 imputer : Any ,
44- list_spaces : List [ Dimension ],
124+ dict_config_opti_imputer : Dict [ str , Any ],
45125 hole_generator : _HoleGenerator ,
46126 n_calls : int = 10 ,
47127 n_jobs : int = - 1 ,
48128 loss_norm : int = 1 ,
49129 ):
50130 self .imputer = imputer
51- self .list_spaces = list_spaces
131+ self .dict_config_opti_imputer = dict_config_opti_imputer
52132 self .hole_generator = hole_generator
53133 self .n_calls = n_calls
54134 self .n_jobs = n_jobs
@@ -89,31 +169,7 @@ def loss_function(
89169 else :
90170 raise ValueError ("loss_norm has to be 0 or 1 (int)" )
91171
92- def deflat_hyperparams (
93- self , hyperparams_flat : Dict [str , Union [float , int , str ]]
94- ) -> Dict [str , Union [float , int , str , Dict [str , Union [float , int , str ]]]]:
95- """
96- Set the hyperparameters to the model
97-
98- Parameters
99- ----------
100- hyperparams_flat : Dict[str, Union[int, float, str]]
101- dictionary containing the hyperparameters and their value
102- """
103- hyperparams : Dict [str , Any ] = {}
104- for name_dimension , hyperparam in hyperparams_flat .items ():
105- if "/" not in name_dimension :
106- hyperparams [name_dimension ] = hyperparam
107- else :
108- name_hyperparam , col = name_dimension .split ("/" )
109- if name_hyperparam in hyperparams :
110- hyperparams [name_hyperparam ][col ] = hyperparam
111- else :
112- new_dict : Dict [str , Union [float , int , str ]] = {col : hyperparam }
113- hyperparams [name_hyperparam ] = new_dict
114- return hyperparams
115-
116- def objective (self , X ):
172+ def objective (self , df : pd .DataFrame , list_spaces : List [Dimension ]) -> Callable :
117173 """
118174 Define the objective function for the cross-validation
119175
@@ -123,17 +179,17 @@ def objective(self, X):
123179 objective function
124180 """
125181
126- @skopt .utils .use_named_args (self . list_spaces )
182+ @skopt .utils .use_named_args (list_spaces )
127183 def obj_func (** hyperparams_flat ):
128- self .imputer .hyperparams_optim = self . deflat_hyperparams (hyperparams_flat )
184+ self .imputer .hyperparams_optim = deflat_hyperparams (hyperparams_flat )
129185
130186 errors = []
131187
132- for df_mask in self .hole_generator .split (X ):
133- df_origin = X .copy ()
188+ for df_mask in self .hole_generator .split (df ):
189+ df_origin = df .copy ()
134190 df_corrupted = df_origin .copy ()
135191 df_corrupted [df_mask ] = np .nan
136- cols_with_nans = X .columns [X .isna ().any (axis = 0 )].tolist ()
192+ cols_with_nans = df .columns [df .isna ().any (axis = 0 )].tolist ()
137193 imputed = self .imputer .fit_transform (df_corrupted )
138194
139195 error = self .loss_function (
@@ -148,55 +204,49 @@ def obj_func(**hyperparams_flat):
148204
149205 return obj_func
150206
def optimize_hyperparams(self, df: pd.DataFrame) -> Dict[str, Union[float, int, str]]:
    """Search the hyperparameters minimising the cross-validation objective.

    The search space is built from ``self.dict_config_opti_imputer`` and
    explored with ``skopt.gp_minimize``, seeded with
    ``self.imputer.random_state``.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe handed to the objective function.

    Returns
    -------
    Dict[str, Union[float, int, str]]
        Optimal hyperparameters in flat form, keyed by dimension name.
    """
    list_spaces = get_search_space(self.dict_config_opti_imputer)
    # skopt refuses to run when more initial points are requested than total
    # calls, so the usual max(5, n_calls // 5) is capped at n_calls.
    n_initial_points = min(self.n_calls, max(5, self.n_calls // 5))
    res = skopt.gp_minimize(
        self.objective(df, list_spaces),
        dimensions=list_spaces,
        n_calls=self.n_calls,
        n_initial_points=n_initial_points,
        random_state=self.imputer.random_state,
        n_jobs=self.n_jobs,
    )

    # res["x"] lists the best values in the same order as the dimensions.
    hyperparams_flat = {space.name: val for space, val in zip(list_spaces, res["x"])}
    return hyperparams_flat
232+
def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
    """Tune the imputer on ``df``, then impute ``df`` with the tuned imputer.

    Parameters
    ----------
    df : pd.DataFrame
        Dataframe to impute.

    Returns
    -------
    pd.DataFrame
        Imputed dataframe.
    """
    best_flat = self.optimize_hyperparams(df)
    # The imputer receives the nested form, not the flat "name/column" keys.
    self.imputer.hyperparams_optim = deflat_hyperparams(best_flat)
    return self.imputer.fit_transform(df)
0 commit comments