
Commit 550856b

Julien Roussel authored and committed
readme results made coherent
1 parent 7fb0c56 commit 550856b

8 files changed (+107, -91 lines)


README.rst

Lines changed: 1 addition & 2 deletions
@@ -88,9 +88,8 @@ With just these few lines of code, you can see how easy it is to
 generator_holes = missing_patterns.EmpiricalHoleGenerator(n_splits=4, ratio_masked=0.1)
 comparison = comparator.Comparator(
     dict_imputers,
-    columns,
     generator_holes = generator_holes,
-    metrics = ["mae", "wmape", "kl_columnwise", "ks_test", "energy"],
+    metrics = ["mae", "wmape", "kl_columnwise", "frechet"],
 )
 results = comparison.compare(df_with_nan)
 results.style.highlight_min(color="lightsteelblue", axis=1)
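
For context, the README snippet above assumes that dict_imputers and df_with_nan already exist. A minimal self-contained sketch of that setup is given below; the synthetic data and the ImputerMean/ImputerMedian choices are assumptions for illustration, not part of this commit.

# Illustrative setup for the README example; imputer choices and data are assumptions.
import numpy as np
import pandas as pd

from qolmat.benchmark import comparator, missing_patterns
from qolmat.imputations import imputers

rng = np.random.default_rng(42)
df_with_nan = pd.DataFrame(rng.normal(size=(200, 3)), columns=["a", "b", "c"])
df_with_nan = df_with_nan.mask(rng.random(df_with_nan.shape) < 0.1)  # ~10% missing values

dict_imputers = {
    "mean": imputers.ImputerMean(),
    "median": imputers.ImputerMedian(),
}

generator_holes = missing_patterns.EmpiricalHoleGenerator(n_splits=4, ratio_masked=0.1)
comparison = comparator.Comparator(
    dict_imputers,
    generator_holes=generator_holes,
    metrics=["mae", "wmape", "kl_columnwise", "frechet"],
)
results = comparison.compare(df_with_nan)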

examples/tutorials/plot_tuto_diffusion_models.py

Lines changed: 0 additions & 2 deletions
@@ -169,7 +169,6 @@
 
 comparison = comparator.Comparator(
     dict_imputers,
-    selected_columns=df_data.columns,
     generator_holes=missing_patterns.UniformHoleGenerator(n_splits=2, random_state=rng),
     metrics=["mae", "kl_columnwise"],
 )
@@ -224,7 +223,6 @@
 
 comparison = comparator.Comparator(
     dict_imputers,
-    selected_columns=df_data.columns,
     generator_holes=missing_patterns.UniformHoleGenerator(n_splits=2, random_state=rng),
     metrics=["mae", "kl_columnwise"],
 )

qolmat/benchmark/comparator.py

Lines changed: 0 additions & 5 deletions
@@ -28,9 +28,6 @@ class Comparator:
     ----------
     dict_models: Dict[str, any]
         dictionary of imputation methods
-    selected_columns: List[str]
-        list of column's names selected (all with at least one null value will
-        be imputed)
     columnwise_evaluation : Optional[bool], optional
         whether the metric should be calculated column-wise or not,
         by default False
@@ -46,7 +43,6 @@ class Comparator:
     def __init__(
         self,
         dict_models: Dict[str, Any],
-        selected_columns: List[str],
         generator_holes: _HoleGenerator,
         metrics: List = ["mae", "wmape", "kl_columnwise"],
         dict_config_opti: Optional[Dict[str, Any]] = {},
@@ -55,7 +51,6 @@ def __init__(
         verbose: bool = False,
     ):
         self.dict_imputers = dict_models
-        self.selected_columns = selected_columns
         self.generator_holes = generator_holes
         self.metrics = metrics
         self.dict_config_opti = dict_config_opti
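
For downstream users, the only required change at call sites is to drop the selected_columns argument. A hedged before/after sketch, reusing the dict_imputers and generator_holes names from the setup sketch under the README diff above:

# Before this commit, the Comparator constructor took selected_columns:
# comparison = comparator.Comparator(
#     dict_imputers,
#     selected_columns=df_with_nan.columns,
#     generator_holes=generator_holes,
#     metrics=["mae", "wmape", "kl_columnwise"],
# )

# After this commit, the argument is simply removed:
comparison = comparator.Comparator(
    dict_imputers,
    generator_holes=generator_holes,
    metrics=["mae", "wmape", "kl_columnwise"],
)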

qolmat/benchmark/metrics.py

Lines changed: 11 additions & 2 deletions
@@ -835,6 +835,7 @@ def sum_pairwise_distances(
 def frechet_distance_base(
     df1: pd.DataFrame,
     df2: pd.DataFrame,
+    df_mask: pd.DataFrame,
 ) -> pd.Series:
     """Compute the Fréchet distance between two dataframes df1 and df2.
 
@@ -853,16 +854,24 @@ def frechet_distance_base(
         true dataframe
     df2 : pd.DataFrame
         predicted dataframe
+    df_mask : pd.DataFrame
+        Elements of the dataframes to compute on
 
     Returns
     -------
     pd.Series
         Frechet distance in a Series object
 
     """
-    if df1.shape != df2.shape:
+    if df1.shape != df2.shape or df1.shape != df_mask.shape:
         raise Exception("inputs have to be of same dimensions.")
 
+    df1 = df1.copy()
+    df2 = df2.copy()
+    # Set to nan the values not in the mask
+    df1[~df_mask] = np.nan
+    df2[~df_mask] = np.nan
+
     std = (np.std(df1) + np.std(df2) + EPS) / 2
     mu = (np.nanmean(df1, axis=0) + np.nanmean(df2, axis=0)) / 2
     df1 = (df1 - mu) / std
@@ -911,7 +920,7 @@ def frechet_distance(
 
     """
     if method == "single":
-        return frechet_distance_base(df1, df2)
+        return frechet_distance_base(df1, df2, df_mask)
     return pattern_based_weighted_mean_metric(
         df1,
         df2,
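
For reference, the quantity estimated here is the Fréchet (Gaussian 2-Wasserstein) distance: for N(mu1, cov1) and N(mu2, cov2) it equals d^2 = ||mu1 - mu2||^2 + Tr(cov1 + cov2 - 2 (cov1 cov2)^(1/2)). The sketch below computes that closed form with plain NumPy/SciPy from sample moments; it is independent of Qolmat's implementation, which additionally standardises the data and, after this commit, restricts both dataframes to the masked entries.

import numpy as np
from scipy import linalg


def frechet_gaussian(mu1, cov1, mu2, cov2):
    """Frechet distance between N(mu1, cov1) and N(mu2, cov2)."""
    diff = mu1 - mu2
    covmean = linalg.sqrtm(cov1 @ cov2)  # matrix square root of the covariance product
    if np.iscomplexobj(covmean):
        covmean = covmean.real  # discard numerical imaginary noise
    d2 = diff @ diff + np.trace(cov1 + cov2 - 2.0 * covmean)
    return float(np.sqrt(max(d2, 0.0)))


# Estimate the distance between two samples from their empirical moments
rng = np.random.default_rng(0)
x = rng.normal(size=(500, 3))
y = rng.normal(loc=0.2, size=(500, 3))
d = frechet_gaussian(x.mean(axis=0), np.cov(x, rowvar=False),
                     y.mean(axis=0), np.cov(y, rowvar=False))
print(f"Frechet distance: {d:.3f}")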

qolmat/imputations/imputers_pytorch.py

Lines changed: 24 additions & 17 deletions
@@ -8,6 +8,7 @@
 import pandas as pd
 from numpy.typing import NDArray
 from sklearn.preprocessing import StandardScaler
+from tqdm import tqdm
 
 # from typing_extensions import Self
 from qolmat.benchmark import metrics
@@ -106,23 +107,29 @@ def _fit_estimator(
         optimizer = optim.Adam(estimator.parameters(), lr=self.learning_rate)
         loss_fn = self.loss_fn
 
-        for epoch in range(self.epochs):
-            estimator.train()
-            optimizer.zero_grad()
-
-            input_data = torch.Tensor(X.values)
-            target_data = torch.Tensor(y.values)
-            target_data = target_data.unsqueeze(1)
-            outputs = estimator(input_data)
-            loss = loss_fn(outputs, target_data)
-
-            loss.backward()
-            optimizer.step()
-            if (epoch + 1) % 10 == 0:
-                logging.info(
-                    f"Epoch [{epoch + 1}/{self.epochs}], "
-                    f"Loss: {loss.item():.4f}"
-                )
+        # if X.shape[0] != estimator[0].in_features:
+        #     raise ValueError(
+        #         "The number of features in X does not match the input "
+        #         "features of the estimator. The estimator expects"
+        #         f" {estimator[0].in_features} features, but X has "
+        #         f"{X.shape[0]} features."
+        #     )
+
+        with tqdm(total=self.epochs, desc="Training", unit="epoch") as pbar:
+            for _ in range(self.epochs):
+                estimator.train()
+                optimizer.zero_grad()
+
+                input_data = torch.Tensor(X.values)
+                target_data = torch.Tensor(y.values)
+                target_data = target_data.unsqueeze(1)
+                outputs = estimator(input_data)
+                loss = loss_fn(outputs, target_data)
+
+                loss.backward()
+                optimizer.step()
+                pbar.set_postfix(loss=f"{loss.item():.4f}")
+                pbar.update(1)
         return estimator
 
     def _predict_estimator(
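
This change replaces the periodic logging.info call with a tqdm progress bar that reports the running loss once per epoch. The same pattern in isolation, as a small self-contained PyTorch sketch; the model, data and hyperparameters are made up for illustration and are not Qolmat code.

import torch
from torch import nn, optim
from tqdm import tqdm

# Toy regression problem, purely illustrative
X = torch.randn(256, 4)
y = X.sum(dim=1, keepdim=True) + 0.1 * torch.randn(256, 1)

model = nn.Sequential(nn.Linear(4, 16), nn.ReLU(), nn.Linear(16, 1))
optimizer = optim.Adam(model.parameters(), lr=1e-2)
loss_fn = nn.MSELoss()
epochs = 50

with tqdm(total=epochs, desc="Training", unit="epoch") as pbar:
    for _ in range(epochs):
        model.train()
        optimizer.zero_grad()
        loss = loss_fn(model(X), y)
        loss.backward()
        optimizer.step()
        # Same reporting style as the patched _fit_estimator
        pbar.set_postfix(loss=f"{loss.item():.4f}")
        pbar.update(1)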

qolmat/imputations/rpca/rpca_noisy.py

Lines changed: 63 additions & 57 deletions
@@ -317,67 +317,73 @@ def minimise_loss(
        Ir = np.eye(rank)
        In = identity(n_rows)

-        for _ in tqdm(
-            range(max_iterations),
+        with tqdm(
+            total=max_iterations,
             desc="Noisy RPCA loss minimization",
+            unit="iteration",
             disable=not verbose,
-        ):
-            M_temp = M.copy()
-            A_temp = A.copy()
-            L_temp = L.copy()
-            Q_temp = Q.copy()
-            if norm == "L1":
-                R_temp = R.copy()
-                sums = np.zeros((n_rows, n_cols))
-                for i_period, _ in enumerate(list_periods):
-                    sums += mu * R[i_period] - list_H[i_period] @ Y
-
-                M = spsolve(
-                    (1 + mu) * In + HtH,
-                    D - A + mu * L @ Q - Y + sums,
-                )
-            else:
-                M = spsolve(
-                    (1 + mu) * In + 2 * HtH,
-                    D - A + mu * L @ Q - Y,
-                )
-            M = M.reshape(D.shape)
-
-            A_Omega = rpca_utils.soft_thresholding(D - M, lam)
-            A_Omega_C = D - M
-            A = np.where(Omega, A_Omega, A_Omega_C)
-            Q = scp.linalg.solve(
-                a=tau * Ir + mu * (L.T @ L),
-                b=L.T @ (mu * M + Y),
-            )
-
-            L = scp.linalg.solve(
-                a=tau * Ir + mu * (Q @ Q.T),
-                b=Q @ (mu * M.T + Y.T),
-            ).T
-
-            Y += mu * (M - L @ Q)
-            if norm == "L1":
-                for i_period, _ in enumerate(list_periods):
-                    eta = list_etas[i_period]
-                    R[i_period] = rpca_utils.soft_thresholding(
-                        R[i_period] / mu, eta / mu
+        ) as pbar:
+            for _ in range(max_iterations):
+                M_temp = M.copy()
+                A_temp = A.copy()
+                L_temp = L.copy()
+                Q_temp = Q.copy()
+                if norm == "L1":
+                    R_temp = R.copy()
+                    sums = np.zeros((n_rows, n_cols))
+                    for i_period, _ in enumerate(list_periods):
+                        sums += mu * R[i_period] - list_H[i_period] @ Y
+
+                    M = spsolve(
+                        (1 + mu) * In + HtH,
+                        D - A + mu * L @ Q - Y + sums,
                    )
+                else:
+                    M = spsolve(
+                        (1 + mu) * In + 2 * HtH,
+                        D - A + mu * L @ Q - Y,
+                    )
+                M = M.reshape(D.shape)
+
+                A_Omega = rpca_utils.soft_thresholding(D - M, lam)
+                A_Omega_C = D - M
+                A = np.where(Omega, A_Omega, A_Omega_C)
+                Q = scp.linalg.solve(
+                    a=tau * Ir + mu * (L.T @ L),
+                    b=L.T @ (mu * M + Y),
+                )

-            mu = min(mu * rho, mu_bar)
-
-            Mc = np.linalg.norm(M - M_temp, np.inf)
-            Ac = np.linalg.norm(A - A_temp, np.inf)
-            Lc = np.linalg.norm(L - L_temp, np.inf)
-            Qc = np.linalg.norm(Q - Q_temp, np.inf)
-            error_max = max([Mc, Ac, Lc, Qc])  # type: ignore # noqa
-            if norm == "L1":
-                for i_period, _ in enumerate(list_periods):
-                    Rc = np.linalg.norm(R[i_period] - R_temp[i_period], np.inf)
-                    error_max = max(error_max, Rc)  # type: ignore # noqa
-
-            if error_max < tolerance:
-                break
+                L = scp.linalg.solve(
+                    a=tau * Ir + mu * (Q @ Q.T),
+                    b=Q @ (mu * M.T + Y.T),
+                ).T
+
+                Y += mu * (M - L @ Q)
+                if norm == "L1":
+                    for i_period, _ in enumerate(list_periods):
+                        eta = list_etas[i_period]
+                        R[i_period] = rpca_utils.soft_thresholding(
+                            R[i_period] / mu, eta / mu
+                        )
+
+                mu = min(mu * rho, mu_bar)
+
+                Mc = np.linalg.norm(M - M_temp, np.inf)
+                Ac = np.linalg.norm(A - A_temp, np.inf)
+                Lc = np.linalg.norm(L - L_temp, np.inf)
+                Qc = np.linalg.norm(Q - Q_temp, np.inf)
+                error_max = max([Mc, Ac, Lc, Qc])  # type: ignore # noqa
+                if norm == "L1":
+                    for i_period, _ in enumerate(list_periods):
+                        Rc = np.linalg.norm(
+                            R[i_period] - R_temp[i_period], np.inf
+                        )
+                        error_max = max(error_max, Rc)  # type: ignore # noqa
+
+                if error_max < tolerance:
+                    break
+                pbar.set_postfix(error=f"{error_max.item():.4f}")
+                pbar.update(1)

        M = L @ Q

tests/benchmark/test_comparator.py

Lines changed: 0 additions & 2 deletions
@@ -40,7 +40,6 @@ def generator_holes_mock(mocker: MockerFixture) -> _HoleGenerator:
 def comparator(generator_holes_mock: _HoleGenerator) -> Comparator:
     return Comparator(
         dict_models={},
-        selected_columns=["A", "B"],
         generator_holes=generator_holes_mock,
         metrics=["mae", "mse"],
     )
@@ -439,7 +438,6 @@ def test_compare_reproducibility():
     )
     comparator = Comparator(
         dict_models=dict_models,
-        selected_columns=df_data.columns,
         generator_holes=generator_holes,
         metrics=["mae", "mse"],
     )

tests/benchmark/test_metrics.py

Lines changed: 8 additions & 4 deletions
@@ -171,12 +171,16 @@ def test_kl_divergence_gaussian(
 
 @pytest.mark.parametrize("df1", [df_incomplete])
 @pytest.mark.parametrize("df2", [df_imputed])
-def test_frechet_distance_base(df1: pd.DataFrame, df2: pd.DataFrame) -> None:
-    result = metrics.frechet_distance_base(df1, df1)
+@pytest.mark.parametrize("df_mask", [df_mask])
+def test_frechet_distance_base(
+    df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame
+) -> None:
+    result = metrics.frechet_distance_base(df1, df1, df_mask)
     np.testing.assert_allclose(result, 0, atol=1e-3)
 
-    result = metrics.frechet_distance_base(df1, df2)
-    np.testing.assert_allclose(result, 0.134, atol=1e-3)
+    result = metrics.frechet_distance_base(df1, df2, df_mask)
+    assert np.all(0 < result)
+    assert np.all(result < 1)
 
 
 @pytest.mark.parametrize("df1", [df_incomplete])
