Skip to content

Commit 090b4ba

Browse files
Merge pull request #57 from Quantmetry/loggers_removed
Loggers removed
2 parents cc6ef55 + 561ddbe commit 090b4ba

File tree

16 files changed

+321
-169
lines changed

16 files changed

+321
-169
lines changed

examples/RPCA.md

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,10 @@ X_true, A_true, E_true = generate_artificial_ts(n_samples, periods, amp_anomalie
4646
signal = X_true + A_true + E_true
4747

4848
# Adding missing data
49-
signal[5:20] = np.nan
49+
#signal[5:20] = np.nan
50+
mask = np.random.choice(len(signal), round(len(signal) / 20))
51+
signal[mask] = np.nan
52+
5053
```
5154

5255
```python
@@ -74,7 +77,7 @@ plt.show()
7477

7578
```python
7679
%%time
77-
rpca_pcp = RPCAPCP(period=100, max_iterations=5, mu=.5, lam=1)
80+
rpca_pcp = RPCAPCP(period=100, max_iterations=100, mu=.5, lam=0.1)
7881
X, A = rpca_pcp.decompose_rpca_signal(signal)
7982
imputed = signal - A
8083
```
@@ -89,7 +92,7 @@ plt.plot(imputed)
8992

9093
```python
9194
%%time
92-
rpca_noisy = RPCANoisy(period=10, tau=2, lam=0.3, list_periods=[10], list_etas=[0.01], norm="L2")
95+
rpca_noisy = RPCANoisy(period=10, tau=1, lam=0.4, list_periods=[10], list_etas=[0.01], norm="L2")
9396
X, A = rpca_noisy.decompose_rpca_signal(signal)
9497
```
9598

examples/benchmark.md

Lines changed: 23 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,8 @@ ratio_masked = 0.1
116116
```
117117

118118
```python
119+
dict_config_opti = {}
120+
119121
imputer_mean = imputers.ImputerMean(groups=("station",))
120122
imputer_median = imputers.ImputerMedian(groups=("station",))
121123
imputer_mode = imputers.ImputerMode(groups=("station",))
@@ -126,7 +128,19 @@ imputer_spline = imputers.ImputerInterpolation(groups=("station",), method="spli
126128
imputer_shuffle = imputers.ImputerShuffle(groups=("station",))
127129
imputer_residuals = imputers.ImputerResiduals(groups=("station",), period=365, model_tsa="additive", extrapolate_trend="freq", method_interpolation="linear")
128130

129-
imputer_rpca = imputers.ImputerRPCA(groups=("station",), columnwise=False, max_iterations=256, tau=2, lam=1)
131+
imputer_rpca = imputers.ImputerRPCA(groups=("station",), columnwise=False, max_iterations=500, tau=2, lam=0.05)
132+
imputer_rpca_opti = imputers.ImputerRPCA(groups=("station",), columnwise=False, max_iterations=256)
133+
dict_config_opti["RPCA_opti"] = {
134+
"tau": ho.hp.uniform("tau", low=.5, high=5),
135+
"lam": ho.hp.uniform("lam", low=.1, high=1),
136+
}
137+
imputer_rpca_opticw = imputers.ImputerRPCA(groups=("station",), columnwise=False, max_iterations=256)
138+
dict_config_opti["RPCA_opticw"] = {
139+
"tau/TEMP": ho.hp.uniform("tau/TEMP", low=.5, high=5),
140+
"tau/PRES": ho.hp.uniform("tau/PRES", low=.5, high=5),
141+
"lam/TEMP": ho.hp.uniform("lam/TEMP", low=.1, high=1),
142+
"lam/PRES": ho.hp.uniform("lam/PRES", low=.1, high=1),
143+
}
130144

131145
imputer_ou = imputers.ImputerEM(groups=("station",), model="multinormal", method="sample", max_iter_em=34, n_iter_ou=15, dt=1e-3)
132146
imputer_tsou = imputers.ImputerEM(groups=("station",), model="VAR1", method="sample", max_iter_em=34, n_iter_ou=15, dt=1e-3)
@@ -142,41 +156,6 @@ imputer_regressor = imputers.ImputerRegressor(groups=("station",), estimator=Lin
142156
generator_holes = missing_patterns.EmpiricalHoleGenerator(n_splits=2, groups=("station",), subset=cols_to_impute, ratio_masked=ratio_masked)
143157
```
144158

145-
```python
146-
dict_config_opti = {
147-
"tau": ho.hp.uniform("tau", low=.5, high=5),
148-
"lam": ho.hp.uniform("lam", low=.1, high=1),
149-
}
150-
imputer_rpca_opti = imputers.ImputerRPCA(groups=("station",), columnwise=False, max_iterations=256)
151-
imputer_rpca_opti = hyperparameters.optimize(
152-
imputer_rpca_opti,
153-
df_data,
154-
generator = generator_holes,
155-
metric="mae",
156-
max_evals=10,
157-
dict_spaces=dict_config_opti
158-
)
159-
# imputer_rpca_opti.params_optim = hyperparams_opti
160-
```
161-
162-
```python
163-
dict_config_opti2 = {
164-
"tau/TEMP": ho.hp.uniform("tau/TEMP", low=.5, high=5),
165-
"tau/PRES": ho.hp.uniform("tau/PRES", low=.5, high=5),
166-
"lam/TEMP": ho.hp.uniform("lam/TEMP", low=.1, high=1),
167-
"lam/PRES": ho.hp.uniform("lam/PRES", low=.1, high=1),
168-
}
169-
imputer_rpca_opti2 = imputers.ImputerRPCA(groups=("station",), columnwise=True, max_iterations=256)
170-
imputer_rpca_opti2 = hyperparameters.optimize(
171-
imputer_rpca_opti2,
172-
df_data,
173-
generator = generator_holes,
174-
metric="mae",
175-
max_evals=10,
176-
dict_spaces=dict_config_opti2
177-
)
178-
```
179-
180159
```python
181160
dict_imputers = {
182161
"mean": imputer_mean,
@@ -189,9 +168,9 @@ dict_imputers = {
189168
# "OU": imputer_ou,
190169
"TSOU": imputer_tsou,
191170
"TSMLE": imputer_tsmle,
192-
# "RPCA": imputer_rpca,
193-
# "RPCA_opti": imputer_rpca_opti,
194-
# "RPCA_opti2": imputer_rpca_opti2,
171+
"RPCA": imputer_rpca,
172+
"RPCA_opti": imputer_rpca_opti,
173+
# "RPCA_opticw": imputer_rpca_opti2,
195174
# "locf": imputer_locf,
196175
# "nocb": imputer_nocb,
197176
# "knn": imputer_knn,
@@ -218,7 +197,7 @@ comparison = comparator.Comparator(
218197
dict_imputers,
219198
cols_to_impute,
220199
generator_holes = generator_holes,
221-
metrics=["mae", "wmape", "KL_columnwise", "ks_test"],
200+
metrics=["mae", "wmape", "KL_columnwise", "ks_test", "dist_corr_pattern"],
222201
max_evals=10,
223202
dict_config_opti=dict_config_opti,
224203
)
@@ -230,11 +209,13 @@ results
230209
df_plot = results.loc["KL_columnwise",'TEMP']
231210
plt.barh(df_plot.index, df_plot, color=tab10(0))
232211
plt.title('TEMP')
212+
plt.xlabel("KL")
233213
plt.show()
234214

235215
df_plot = results.loc["KL_columnwise",'PRES']
236216
plt.barh(df_plot.index, df_plot, color=tab10(0))
237217
plt.title('PRES')
218+
plt.xlabel("KL")
238219
plt.show()
239220
```
240221

@@ -245,8 +226,8 @@ plot.multibar(results.loc["mae"], decimals=1)
245226
plt.ylabel("mae")
246227

247228
fig.add_subplot(2, 1, 2)
248-
plot.multibar(results.loc["KL_columnwise"], decimals=1)
249-
plt.ylabel("KL")
229+
plot.multibar(results.loc["dist_corr_pattern"], decimals=2)
230+
plt.ylabel("dist_corr_pattern")
250231

251232
plt.savefig("figures/imputations_benchmark_errors.png")
252233
plt.show()
@@ -294,10 +275,6 @@ for col in cols_to_impute:
294275

295276
```
296277

297-
```python
298-
dfs_imputed
299-
```
300-
301278
```python
302279
# plot.plot_imputations(df_station, dfs_imputed_station)
303280

qolmat/benchmark/comparator.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,15 @@ def __init__(
3535
metrics: List = ["mae", "wmape", "KL_columnwise"],
3636
dict_config_opti: Optional[Dict[str, Any]] = {},
3737
max_evals: int = 10,
38+
verbose: bool = False,
3839
):
3940
self.dict_imputers = dict_models
4041
self.selected_columns = selected_columns
4142
self.generator_holes = generator_holes
4243
self.metrics = metrics
4344
self.dict_config_opti = dict_config_opti
4445
self.max_evals = max_evals
46+
self.verbose = verbose
4547

4648
def get_errors(
4749
self,
@@ -106,6 +108,7 @@ def evaluate_errors_sample(
106108
metric_optim,
107109
dict_config_opti_imputer,
108110
max_evals=self.max_evals,
111+
verbose=self.verbose,
109112
)
110113
df_imputed = imputer_opti.fit_transform(df_corrupted)
111114
subset = self.generator_holes.subset
Lines changed: 61 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import copy
2-
import logging
32
from typing import Any, Callable, Dict, List, Union
43

54
import numpy as np
@@ -12,21 +11,38 @@
1211
from qolmat.benchmark import metrics
1312

1413
from qolmat.benchmark.missing_patterns import _HoleGenerator
14+
from qolmat.imputations.imputers import _Imputer
15+
from qolmat.utils.utils import HyperValue
1516

16-
logger = logging.getLogger(__name__)
17-
logger.setLevel(logging.DEBUG)
1817

19-
HyperValue = Union[int, float, str]
20-
21-
22-
def get_objective(imputer, df, generator, metric, names_hyperparams) -> Callable:
18+
def get_objective(
19+
imputer: _Imputer,
20+
df: pd.DataFrame,
21+
generator: _HoleGenerator,
22+
metric: str,
23+
names_hyperparams: List[str],
24+
) -> Callable:
2325
"""
24-
Define the objective function for the cross-validation
26+
Define the objective function, which is the average metric computed over the folds provided by
27+
the hole generator, using a cross-validation.
28+
29+
Parameters
30+
----------
31+
imputer: _Imputer
32+
Imputer that should be optimized, it should at least have a fit_transform method and an
33+
imputer_params attribute
34+
generator: _HoleGenerator
35+
Generator creating the masked values in the nested cross validation allowing to measure the
36+
imputer performance
37+
metric: str
38+
Metric used as performance indicator, common values are `mse` and `mae`
39+
names_hyperparams: List[str]
40+
List of the names of the hyperparameters which are being optimized
2541
2642
Returns
2743
-------
28-
_type_
29-
objective function
44+
Callable[List[HyperValue], float]
45+
Objective function
3046
"""
3147

3248
def fun_obf(args: List[HyperValue]) -> float:
@@ -39,7 +55,6 @@ def fun_obf(args: List[HyperValue]) -> float:
3955
df_origin = df.copy()
4056
df_corrupted = df_origin.copy()
4157
df_corrupted[df_mask] = np.nan
42-
4358
df_imputed = imputer.fit_transform(df_corrupted)
4459
subset = generator.subset
4560
fun_metric = metrics.get_metric(metric)
@@ -52,32 +67,55 @@ def fun_obf(args: List[HyperValue]) -> float:
5267
return fun_obf
5368

5469

55-
def optimize(imputer, df, generator, metric, dict_spaces, max_evals=100):
56-
"""Optimize hyperparamaters
70+
def optimize(
71+
imputer: _Imputer,
72+
df: pd.DataFrame,
73+
generator: _HoleGenerator,
74+
metric: str,
75+
dict_config: Dict[str, HyperValue],
76+
max_evals: int = 100,
77+
verbose: bool = False,
78+
):
79+
"""Return the provided imputer with hyperparameters optimized in the provided range in order to
80+
minimize the provided metric.
5781
5882
Parameters
5983
----------
60-
df : pd.DataFrame
61-
DataFrame masked
84+
imputer: _Imputer
85+
Imputer that should be optimized, it should at least have a fit_transform method and an
86+
imputer_params attribute
87+
generator: _HoleGenerator
88+
Generator creating the masked values in the nested cross validation allowing to measure the
89+
imputer performance
90+
metric: str
91+
Metric used as performance indicator, common values are `mse` and `mae`
92+
dict_config: Dict[str, HyperValue]
93+
Search space for the tested hyperparameters
94+
max_evals: int
95+
Maximum number of evaluations of the performance of the algorithm. Each estimation involves
96+
one call to fit_transform per fold returned by the generator. See the n_fold attribute.
97+
verbose: bool
98+
Verbosity switch, useful for imputers that can have unstable behavior for some
99+
hyperparameter values
62100
63101
Returns
64102
-------
65-
Dict[str, Any]
66-
hyperparameters optimize flat
103+
_Imputer
104+
Optimized imputer
67105
"""
68106
imputer = copy.deepcopy(imputer)
69-
if dict_spaces == {}:
107+
if dict_config == {}:
70108
return imputer
71-
names_hyperparams = list(dict_spaces.keys())
72-
values_hyperparams = list(dict_spaces.values())
73-
imputer.imputer_params = tuple(set(imputer.imputer_params) | set(dict_spaces.keys()))
109+
names_hyperparams = list(dict_config.keys())
110+
values_hyperparams = list(dict_config.values())
111+
imputer.imputer_params = tuple(set(imputer.imputer_params) | set(dict_config.keys()))
112+
if verbose and hasattr(imputer, "verbose"):
113+
setattr(imputer, "verbose", False)
74114
fun_obj = get_objective(imputer, df, generator, metric, names_hyperparams)
75115
hyperparams = ho.fmin(
76116
fn=fun_obj, space=values_hyperparams, algo=ho.tpe.suggest, max_evals=max_evals
77117
)
78118

79-
# hyperparams = deflat_hyperparams(hyperparams_flat)
80119
for key, value in hyperparams.items():
81120
setattr(imputer, key, value)
82-
# imputer.hyperparams = hyperparams
83121
return imputer

qolmat/benchmark/metrics.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -919,6 +919,7 @@ def get_metric(name: str) -> Callable:
919919
"wasserstein_columnwise": partial(wasserstein_distance, method="columnwise"),
920920
"KL_columnwise": partial(kl_divergence, method="columnwise"),
921921
"KL_gaussian": partial(kl_divergence, method="gaussian"),
922+
"KL_forest": partial(kl_divergence, method="random_forest"),
922923
"ks_test": kolmogorov_smirnov_test,
923924
"correlation_diff": mean_difference_correlation_matrix_numerical_features,
924925
"pairwise_dist": sum_pairwise_distances,

qolmat/benchmark/missing_patterns.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
from __future__ import annotations
22

33
import functools
4-
import logging
54
from typing import Callable, List, Optional, Tuple, Union
5+
import warnings
66

77
import numpy as np
88
import pandas as pd
@@ -11,8 +11,6 @@
1111

1212
from qolmat.utils.exceptions import NoMissingValue, SubsetIsAString
1313

14-
logger = logging.getLogger(__name__)
15-
1614

1715
def compute_transition_counts_matrix(states: pd.Series):
1816
if isinstance(states.iloc[0], tuple):
@@ -305,7 +303,7 @@ def generate_mask(self, X: pd.DataFrame) -> pd.DataFrame:
305303
break
306304

307305
if list_failed:
308-
logger.warning(f"No place to introduce sampled holes of size {list_failed}!")
306+
warnings.warn(f"No place to introduce sampled holes of size {list_failed}!")
309307
return mask
310308

311309

0 commit comments

Comments
 (0)