Skip to content

Commit ba0496d

Browse files
author
Gsaes
committed
Correction MR
1 parent b0cf303 commit ba0496d

File tree

7 files changed

+80
-70
lines changed

7 files changed

+80
-70
lines changed

qolmat/benchmark/cross_validation.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ def get_dimension(dict_bounds: Dict, name_dimension: str) -> Dimension:
3434
return Real(low=dict_bounds["min"], high=dict_bounds["max"], name=name_dimension)
3535
elif dict_bounds["type"] == "Categorical":
3636
return Categorical(categories=dict_bounds["categories"], name=name_dimension)
37+
else:
38+
raise ValueError("The 'type' must be 'Integer', 'Real' or 'Categorical'")
3739

3840

3941
def get_search_space(dict_config_opti_imputer: Dict) -> List[Dimension]:

qolmat/imputations/imputers.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,8 @@ def fit_transform(self, df: pd.DataFrame) -> pd.DataFrame:
103103
df_imputed[col] = self.impute_element(df[[col]])
104104

105105
else:
106+
if any(isinstance(value, dict) for value in hyperparams.values()):
107+
raise AssertionError("hyperparams contains a dictionary. Columnwise must be True.")
106108
self.hyperparams_element = hyperparams
107109
df_imputed = self.impute_element(df)
108110

qolmat/imputations/rpca/rpca.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,12 @@ def __init__(
3333
period: Optional[int] = None,
3434
max_iter: int = int(1e4),
3535
tol: float = 1e-6,
36+
random_state: Union[None, int, np.random.RandomState] = None,
3637
) -> None:
3738
self.n_rows = period
3839
self.max_iter = max_iter
3940
self.tol = tol
41+
self.random_state = random_state
4042

4143
def _prepare_data(self, X: NDArray) -> NDArray:
4244
"""

qolmat/imputations/rpca/rpca_noisy.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,6 @@ def __init__(
5858
max_iter: int = int(1e4),
5959
tol: float = 1e-6,
6060
norm: Optional[str] = "L2",
61-
random_state: Union[None, int, np.random.RandomState] = None,
6261
) -> None:
6362
super().__init__(period=period, max_iter=max_iter, tol=tol)
6463
self.rank = rank
@@ -67,7 +66,6 @@ def __init__(
6766
self.list_periods = list_periods
6867
self.list_etas = list_etas
6968
self.norm = norm
70-
self.random_state = random_state
7169

7270
def decompose_rpca_L1(
7371
self, D: NDArray, Omega: NDArray, lam: float, tau: float, rank: int

qolmat/imputations/rpca/rpca_pcp.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,6 @@ def __init__(
3333
lam: Optional[float] = None,
3434
max_iter: int = int(1e4),
3535
tol: float = 1e-6,
36-
random_state: Union[None, int, np.random.RandomState] = None,
3736
) -> None:
3837
super().__init__(
3938
period=period,
@@ -42,7 +41,6 @@ def __init__(
4241
)
4342
self.mu = mu
4443
self.lam = lam
45-
self.random_state = random_state
4644

4745
def get_params_scale(self, D: NDArray):
4846
mu = D.size / (4.0 * utils.l1_norm(D))

tests/benchmark/test_comparator.py

Lines changed: 26 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -7,44 +7,36 @@
77
from qolmat.benchmark.missing_patterns import EmpiricalHoleGenerator
88

99
df_origin = pd.DataFrame({"col1": [0, np.nan, 2, 4, np.nan], "col2": [-1, np.nan, 0.5, 1, 1.5]})
10-
1110
df_imputed = pd.DataFrame({"col1": [0, 1, 2, 3.5, 4], "col2": [-1.5, 0, 1.5, 2, 1.5]})
12-
1311
df_mask = pd.DataFrame(
1412
{"col1": [False, False, True, True, False], "col2": [True, False, True, True, False]}
1513
)
1614

1715
cols_to_impute = ["col1", "col2"]
1816
generator_holes = EmpiricalHoleGenerator(n_splits=1, ratio_masked=0.5)
19-
dict_imputers_median = {"median": ImputerMedian()}
20-
dict_imputers_rpca = {"rpca": ImputerRPCA(max_iter=100, tau=2)}
21-
search_params = {"rpca": {"lam": {"min": 0.1, "max": 1, "type": "Real"}}}
22-
23-
comparison_median = comparator.Comparator(
24-
dict_models=dict_imputers_median,
25-
selected_columns=cols_to_impute,
26-
generator_holes=generator_holes,
27-
)
17+
dict_imputers = {"rpca": ImputerRPCA(max_iter=100, tau=2)}
18+
dict_config_opti = {"rpca": {"lam": {"min": 0.1, "max": 1, "type": "Real"}}}
2819

2920
comparison_rpca = comparator.Comparator(
30-
dict_models=dict_imputers_rpca,
21+
dict_models=dict_imputers,
3122
selected_columns=cols_to_impute,
3223
generator_holes=generator_holes,
33-
dict_config_opti=search_params,
24+
dict_config_opti=dict_config_opti,
3425
)
3526

3627
comparison_bug = comparator.Comparator(
37-
dict_models=dict_imputers_median,
28+
dict_models=dict_imputers,
3829
selected_columns=["bug"],
3930
generator_holes=generator_holes,
40-
dict_config_opti=search_params,
31+
dict_config_opti=dict_config_opti,
4132
)
4233

43-
result_expected_median = [3.0, 0.5, 0.75, 0.5, 37.88948, 39.68123]
44-
result_expected_rpca = [3.0, 0.5, 0.75, 0.5, 37.88948, 39.68123]
45-
46-
comparison_dict = {"median": comparison_median, "rpca": comparison_rpca, "bug": comparison_bug}
47-
result_expected_dict = {"median": result_expected_median, "rpca": result_expected_rpca}
34+
dict_comparison = {"rpca": comparison_rpca, "bug": comparison_bug}
35+
index_tuples_expected = pd.MultiIndex.from_product(
36+
[["mae", "wmape", "KL_columnwise"], ["col1", "col2"]]
37+
)
38+
data_expected = [3.0, 0.5, 0.75, 0.5, 37.88948, 39.68123]
39+
result_expected = pd.Series(data_expected, index=index_tuples_expected)
4840

4941

5042
@pytest.mark.parametrize("df1", [df_origin])
@@ -53,31 +45,29 @@
5345
def test_benchmark_comparator_get_errors(
5446
df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame
5547
) -> None:
56-
result_comparison = comparison_median.get_errors(
57-
df_origin=df1, df_imputed=df2, df_mask=df_mask
48+
result = comparison_rpca.get_errors(df_origin=df1, df_imputed=df2, df_mask=df_mask)
49+
index_tuples_expected = pd.MultiIndex.from_product(
50+
[["mae", "wmape", "KL_columnwise"], ["col1", "col2"]]
51+
)
52+
result_expected = pd.Series(
53+
[0.25, 0.83333, 0.0625, 1.16666, 18.80089, 36.63671], index=index_tuples_expected
5854
)
59-
result = list(result_comparison.values)
60-
result_expected = [0.25, 0.83333, 0.0625, 1.16666, 18.80089, 36.63671]
6155
np.testing.assert_allclose(result, result_expected, atol=1e-5)
6256

6357

6458
@pytest.mark.parametrize("df1", [df_origin])
6559
def test_benchmark_comparator_evaluate_errors_sample(df1: pd.DataFrame) -> None:
66-
result_comparison = comparison_median.evaluate_errors_sample(
67-
dict_imputers_median["median"], df1
68-
)
69-
result = comparison_rpca.evaluate_errors_sample(dict_imputers_rpca["rpca"], df1)
70-
result = list(result_comparison.values)
71-
np.testing.assert_allclose(result, result_expected_median, atol=1e-5)
60+
result = comparison_rpca.evaluate_errors_sample(dict_imputers["rpca"], df1)
61+
np.testing.assert_allclose(result, result_expected, atol=1e-5)
7262

7363

7464
@pytest.mark.parametrize("df1", [df_origin])
75-
@pytest.mark.parametrize("imputer", ["median", "rpca", "bug"])
65+
@pytest.mark.parametrize("imputer", ["rpca", "bug"])
7666
def test_benchmark_comparator_compare(df1: pd.DataFrame, imputer: str) -> None:
77-
comparison = comparison_dict[imputer]
67+
comparison = dict_comparison[imputer]
7868
if imputer == "bug":
79-
np.testing.assert_raises(Exception, comparison.compare, df1)
69+
np.testing.assert_raises(Exception, comparison.compare, df_origin)
8070
else:
81-
result_comparison = comparison.compare(df1)
82-
result = list(result_comparison.values.flatten())
83-
np.testing.assert_allclose(result, result_expected_dict[imputer], atol=1e-5)
71+
result = comparison.compare(df_origin)
72+
result_expected_DataFrame = pd.DataFrame(result_expected)
73+
np.testing.assert_allclose(result, result_expected_DataFrame, atol=1e-5)

tests/benchmark/test_cross_validation.py

Lines changed: 48 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -10,61 +10,75 @@
1010
df_origin = pd.DataFrame({"col1": [0, np.nan, 2, 4, np.nan], "col2": [-1, np.nan, 0.5, 1, 1.5]})
1111
df_imputed = pd.DataFrame({"col1": [0, 1, 2, 3.5, 4], "col2": [-1.5, 0, 1.5, 2, 1.5]})
1212
df_mask = pd.DataFrame(
13-
{"col1": [False, False, True, True, False], "col2": [True, False, True, True, False]}
13+
{"col1": [False, False, True, False, False], "col2": [True, False, True, True, False]}
1414
)
1515
df_corrupted = df_origin.copy()
1616
df_corrupted[df_mask] = np.nan
1717

18-
imputer_rpca = ImputerRPCA(tau=2, random_state=42)
18+
imputer_rpca = ImputerRPCA(tau=2, random_state=42, columnwise=True, period=1)
1919
dict_imputers_rpca = {"rpca": imputer_rpca}
2020
generator_holes = EmpiricalHoleGenerator(n_splits=1, ratio_masked=0.5)
2121
dict_config_opti = {
2222
"rpca": {
23-
"lam": {"min": 0.1, "max": 1, "type": "Real"},
23+
"lam": {
24+
"col1": {"min": 0.1, "max": 6, "type": "Real"},
25+
"col2": {"min": 1, "max": 4, "type": "Real"},
26+
},
27+
"tol": {"min": 1e-6, "max": 0.1, "type": "Real"},
2428
"max_iter": {"min": 99, "max": 100, "type": "Integer"},
2529
"norm": {"categories": ["L1", "L2"], "type": "Categorical"},
2630
}
2731
}
28-
dict_config_opti_imputer = dict_config_opti.get("rpca", {})
29-
hyperparams_flat = {"lam": 0.93382, "max_iter": 100, "norm": "L1"}
32+
dict_config_opti_imputer = dict_config_opti["rpca"]
33+
hyperparams_flat = {"lam/col1": 4.7, "lam/col2": 1.5, "tol": 0.07, "max_iter": 100, "norm": "L1"}
3034

3135
cv = cross_validation.CrossValidation(
3236
imputer=imputer_rpca,
3337
dict_config_opti_imputer=dict_config_opti_imputer,
3438
hole_generator=generator_holes,
3539
)
3640

37-
result_params_expected = {"lam": (0.1, 1), "max_iter": (99, 100), "norm": ("L1", "L2")}
41+
result_params_expected = {
42+
"lam1": (0.1, 6),
43+
"lam2": (1, 4),
44+
"tol": (1e-6, 0.1),
45+
"max_iter": (99, 100),
46+
"norm": ("L1", "L2"),
47+
}
3848

3949

40-
@pytest.mark.parametrize("dict_bounds", [dict_config_opti_imputer])
41-
@pytest.mark.parametrize("param", ["lam", "max_iter", "norm"])
42-
def test_benchmark_cross_validation_get_dimension(dict_bounds: Dict, param: str) -> None:
43-
result = cross_validation.get_dimension(dict_bounds=dict_bounds[param], name_dimension=param)
50+
@pytest.mark.parametrize("dict_config_opti_imputer", [dict_config_opti_imputer])
51+
@pytest.mark.parametrize("param", ["tol", "max_iter", "norm"])
52+
def test_benchmark_cross_validation_get_dimension(
53+
dict_config_opti_imputer: Dict, param: str
54+
) -> None:
55+
result = cross_validation.get_dimension(
56+
dict_bounds=dict_config_opti_imputer[param], name_dimension=param
57+
)
4458
result_expected = result_params_expected[param]
45-
np.testing.assert_equal(result.bounds, result_expected)
59+
assert result.bounds == result_expected
4660

4761

4862
@pytest.mark.parametrize("dict_config_opti_imputer", [dict_config_opti_imputer])
4963
def test_benchmark_cross_validation_get_search_space(dict_config_opti_imputer: Dict) -> None:
5064
list_result = cross_validation.get_search_space(dict_config_opti_imputer)
51-
result_expected = [
52-
result_params_expected["lam"],
53-
result_params_expected["max_iter"],
54-
result_params_expected["norm"],
55-
]
56-
for i in range(3):
57-
np.testing.assert_equal(list_result[i].bounds, result_expected[i])
65+
list_expected_bounds = list(result_params_expected.values())
66+
for result, expected_bounds in zip(list_result, list_expected_bounds):
67+
assert result.bounds == expected_bounds
5868

5969

6070
@pytest.mark.parametrize("hyperparams_flat", [hyperparams_flat])
6171
def test_benchmark_cross_validation_deflat_hyperparams(
6272
hyperparams_flat: Dict[str, Union[float, int, str]]
6373
) -> None:
64-
resul_deflat = cross_validation.deflat_hyperparams(hyperparams_flat=hyperparams_flat)
65-
result = list(resul_deflat.values())
66-
result_expected = [0.93382, 100, "L1"]
67-
np.testing.assert_equal(result, result_expected)
74+
result_deflat = cross_validation.deflat_hyperparams(hyperparams_flat=hyperparams_flat)
75+
result_expected = {
76+
"lam": {"col1": 4.7, "col2": 1.5},
77+
"tol": 0.07,
78+
"max_iter": 100,
79+
"norm": "L1",
80+
}
81+
assert result_deflat == result_expected
6882

6983

7084
@pytest.mark.parametrize("df1", [df_origin])
@@ -78,23 +92,27 @@ def test_benchmark_cross_validation_loss_function(
7892
np.testing.assert_raises(ValueError, cv.loss_function, df1, df2, df_mask)
7993
cv.loss_norm = 2
8094
result_cv2 = cv.loss_function(df_origin=df1, df_imputed=df2, df_mask=df_mask)
81-
np.testing.assert_allclose(result_cv2, 1.58113, atol=1e-5)
95+
np.testing.assert_allclose(result_cv2, 1.5, atol=1e-5)
8296
cv.loss_norm = 1
8397
result_cv1 = cv.loss_function(df_origin=df1, df_imputed=df2, df_mask=df_mask)
84-
np.testing.assert_allclose(result_cv1, 3, atol=1e-5)
98+
np.testing.assert_allclose(result_cv1, 2.5, atol=1e-5)
8599

86100

87101
@pytest.mark.parametrize("df", [df_corrupted])
88102
def test_benchmark_cross_validation_optimize_hyperparams(df: pd.DataFrame) -> None:
89103
result_hp = cv.optimize_hyperparams(df)
90-
result = list(result_hp.values())
91-
result_expected = [0.8168886881742098, 99, "L2"]
92-
np.testing.assert_equal(result, result_expected)
104+
result_expected = {
105+
"lam/col1": 4.799603622475375,
106+
"lam/col2": 1.5503043695984915,
107+
"tol": 0.07796932033627668,
108+
"max_iter": 100,
109+
"norm": "L1",
110+
}
111+
assert result_hp == result_expected
93112

94113

95114
@pytest.mark.parametrize("df", [df_corrupted])
96115
def test_benchmark_cross_validation_fit_transform(df: pd.DataFrame) -> None:
97116
result_cv = cv.fit_transform(df)
98-
result = np.array(result_cv)
99-
result_expected = np.array([[0, 1.5], [0, 1.5], [0, 1.5], [0, 1.5], [0, 1.5]])
100-
np.testing.assert_allclose(result, result_expected, atol=1e-5)
117+
result_expected = pd.DataFrame({"col1": [0, 2, 2, 4, 2], "col2": [1.5, 1.5, 1.5, 1.5, 1.5]})
118+
np.testing.assert_allclose(result_cv, result_expected, atol=1e-5)

0 commit comments

Comments
 (0)