
Commit 1771a34

code review: parameter method added

Author: vm-aifluence-jro
1 parent 2db275d

8 files changed: +94 −98 lines changed


.pre-commit-config.yaml

Lines changed: 0 additions & 1 deletion
@@ -23,4 +23,3 @@ repos:
     rev: v1.1.1
     hooks:
       - id: mypy
-        exclude: (tests/old_tests)

examples/benchmark.md

Lines changed: 13 additions & 3 deletions
@@ -144,7 +144,7 @@ imputer_mice = imputers.ImputerMICE(groups=["station"], estimator=LinearRegressi
 imputer_regressor = imputers.ImputerRegressor(groups=["station"], estimator=LinearRegression())
 
 dict_imputers = {
-    # "mean": imputer_mean,
+    "mean": imputer_mean,
     # "median": imputer_median,
     # "mode": imputer_mode,
     "interpolation": imputer_interpol,
@@ -160,7 +160,7 @@ dict_imputers = {
     # "nocb": imputer_nocb,
     # "knn": imputer_knn,
     "ols": imputer_regressor,
-    "mice_ols": imputer_mice,
+    # "mice_ols": imputer_mice,
 }
 n_imputers = len(dict_imputers)
 
@@ -193,14 +193,24 @@ comparison = comparator.Comparator(
     dict_imputers,
     cols_to_impute,
     generator_holes = generator_holes,
-    metrics=["mae", "wmape", "KL"],
+    metrics=["mae", "wmape", "KL", "ks_test", "energy"],
     n_calls_opt=10,
     search_params=search_params,
 )
 results = comparison.compare(df_data)
 results
 ```
 
+```python
+df_plot
+```
+
+```python
+df_plot = results.loc["energy", "All"]
+plt.bar(df_plot.index, df_plot, color=tab10(0))
+plt.show()
+```
+
 ```python
 fig = plt.figure(figsize=(24, 8))
 fig.add_subplot(2, 1, 1)
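
Note on the added plotting cells: they assume `results` (returned by `comparison.compare`) is indexed by (metric, column) pairs with one column per imputer, and that `tab10` is a matplotlib colormap defined earlier in the notebook. A minimal self-contained sketch of the same bar plot, on hypothetical values:

```python
# Hypothetical stand-in for the `results` DataFrame produced by
# comparison.compare: rows are (metric, column) pairs, columns are imputers.
import matplotlib.pyplot as plt
import pandas as pd

results = pd.DataFrame(
    {"mean": [0.42], "interpolation": [0.31], "ols": [0.27]},
    index=pd.MultiIndex.from_tuples([("energy", "All")]),
)
tab10 = plt.get_cmap("tab10")

df_plot = results.loc[("energy", "All")]  # one energy score per imputer
plt.bar(df_plot.index, df_plot, color=tab10(0))
plt.show()
```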

examples/metrics_usage.md

Lines changed: 2 additions & 2 deletions
@@ -28,7 +28,7 @@ from sklearn.linear_model import LinearRegression
 from qolmat.utils import data, plot, utils
 from qolmat.imputations import imputers
 from qolmat.benchmark import comparator, missing_patterns
-from qolmat.benchmark.utils import wasser_distance, kl_divergence, frechet_distance
+from qolmat.benchmark.utils import wasser_distance_columnwise, kl_divergence, frechet_distance
 ```
 
 ```python
@@ -106,7 +106,7 @@ ratio_masked = 0.1
 ```python
 # Metrics
 metrics = {
-    "wasser": wasser_distance,
+    "wasserstein_columnwise": wasserstein_distance_columnwise,
     "KL": kl_divergence
     #"frechet": frechet_distance
 }
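
The metrics dict in this example maps names to callables that compare the original and imputed frames on the masked entries. A minimal sketch of that calling convention, with a hypothetical `wasserstein_columnwise` helper (the qolmat versions follow the same `(df1, df2, df_mask)` signature):

```python
# Sketch of the (df1, df2, df_mask) -> pd.Series metric convention used by
# the benchmark; `wasserstein_columnwise` here is a hypothetical helper.
import pandas as pd
from scipy.stats import wasserstein_distance

def wasserstein_columnwise(df1, df2, df_mask):
    # Compare only the masked entries, column by column.
    return pd.Series(
        {
            col: wasserstein_distance(df1[col][df_mask[col]], df2[col][df_mask[col]])
            for col in df1.columns
        }
    )

df1 = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]})
df2 = pd.DataFrame({"a": [1.5, 2.0, 2.5], "b": [4.0, 5.5, 6.0]})
df_mask = pd.DataFrame(True, index=df1.index, columns=df1.columns)
print(wasserstein_columnwise(df1, df2, df_mask))
```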

qolmat/benchmark/comparator.py

Lines changed: 4 additions & 2 deletions
@@ -1,4 +1,5 @@
 import logging
+from functools import partial
 from typing import Any, Dict, List, Optional, Union
 
 import numpy as np
@@ -33,8 +34,9 @@ class Comparator:
         "rmse": metrics.root_mean_squared_error,
         "mae": metrics.mean_absolute_error,
         "wmape": metrics.weighted_mean_absolute_percentage_error,
-        "wasser": metrics.wasser_distance,
-        "KL": metrics.kl_divergence_columnwise,
+        "wasserstein_columnwise": partial(metrics.wasserstein_distance, method="columnwise"),
+        "KL_columnwise": partial(metrics.kl_divergence, method="columnwise"),
+        "KL_gaussian": partial(metrics.kl_divergence, method="gaussian"),
         "ks_test": metrics.kolmogorov_smirnov_test,
         "correlation_diff": metrics.mean_difference_correlation_matrix_numerical_features,
         "pairwise_dist": metrics.sum_pairwise_distances,

qolmat/benchmark/metrics.py

Lines changed: 54 additions & 60 deletions
@@ -3,7 +3,6 @@
 import numpy as np
 import pandas as pd
 import scipy
-
 from sklearn import metrics as skm
 from sklearn.preprocessing import StandardScaler
 
@@ -105,7 +104,9 @@ def weighted_mean_absolute_percentage_error(
     return columnwise_metric(df1, df2, df_mask, skm.mean_absolute_percentage_error)
 
 
-def wasser_distance(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame) -> pd.Series:
+def wasserstein_distance(
+    df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame, method: str = "columnwise"
+) -> pd.Series:
     """Wasserstein distances between columns of 2 dataframes.
     Wasserstein distance can only be computed columnwise
 
@@ -122,7 +123,13 @@ def wasser_distance(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame)
     -------
     wasserstein distances : pd.Series
     """
-    return columnwise_metric(df1, df2, df_mask, scipy.stats.wasserstein_distance)
+    if method == "columnwise":
+        return columnwise_metric(df1, df2, df_mask, scipy.stats.wasserstein_distance)
+    else:
+        raise AssertionError(
+            f"The parameter of the function wasserstein_distance should be one of"
+            f"the following: [`columnwise`], not `{method}`!"
+        )
 
 
 def kl_divergence_1D(df1: pd.Series, df2: pd.Series) -> np.number:
@@ -134,30 +141,9 @@ def kl_divergence_1D(df1: pd.Series, df2: pd.Series) -> np.number:
     return scipy.stats.entropy(p + EPS, q + EPS)
 
 
-def kl_divergence_columnwise(
-    df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame
+def kl_divergence(
+    df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame, method: str = "columnwise"
 ) -> pd.Series:
-    """TODO documentation
-    Kullback-Leibler divergence between distributions
-    If multivariate normal distributions:
-    https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
-
-    Parameters
-    ----------
-    df1 : pd.DataFrame
-    df2 : pd.DataFrame
-    columnwise_evaluation: Optional[bool]
-        if the evalutation is computed column-wise. By default, is set to False
-
-    Returns
-    -------
-    Kullback-Leibler divergence : Union[float, pd.Series]
-    """
-
-    return columnwise_metric(df1, df2, df_mask, kl_divergence_1D)
-
-
-def kl_divergence(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame) -> pd.Series:
     """TODO Documentation
     Kullback-Leibler divergence between distributions
     If multivariate normal distributions:
@@ -174,22 +160,30 @@ def kl_divergence(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame) -
     -------
     Kullback-Leibler divergence : Union[float, pd.Series]
     """
-    cols = df1.columns.tolist()
-    df_1 = StandardScaler().fit_transform(df1[df_mask.any(axis=1)])
-    df_2 = StandardScaler().fit_transform(df2[df_mask.any(axis=1)])
-
-    n = df_1.shape[0]
-    mu_true = np.nanmean(df_1, axis=0)
-    sigma_true = np.ma.cov(np.ma.masked_invalid(df_1), rowvar=False).data
-    mu_pred = np.nanmean(df_2, axis=0)
-    sigma_pred = np.ma.cov(np.ma.masked_invalid(df_2), rowvar=False).data
-    diff = mu_true - mu_pred
-    inv_sigma_pred = np.linalg.inv(sigma_pred)
-    quad_term = diff.T @ inv_sigma_pred @ diff
-    trace_term = np.trace(inv_sigma_pred @ sigma_true)
-    det_term = np.log(np.linalg.det(sigma_pred) / np.linalg.det(sigma_true))
-    kl = 0.5 * (quad_term + trace_term + det_term - n)
-    return pd.Series(kl, index=cols)
+    if method == "columnwise":
+        return columnwise_metric(df1, df2, df_mask, kl_divergence_1D)
+    elif method == "gaussian":
+        cols = df1.columns.tolist()
+        df_1 = StandardScaler().fit_transform(df1[df_mask.any(axis=1)])
+        df_2 = StandardScaler().fit_transform(df2[df_mask.any(axis=1)])
+
+        n = df_1.shape[0]
+        mu_true = np.nanmean(df_1, axis=0)
+        sigma_true = np.ma.cov(np.ma.masked_invalid(df_1), rowvar=False).data
+        mu_pred = np.nanmean(df_2, axis=0)
+        sigma_pred = np.ma.cov(np.ma.masked_invalid(df_2), rowvar=False).data
+        diff = mu_true - mu_pred
+        inv_sigma_pred = np.linalg.inv(sigma_pred)
+        quad_term = diff.T @ inv_sigma_pred @ diff
+        trace_term = np.trace(inv_sigma_pred @ sigma_true)
+        det_term = np.log(np.linalg.det(sigma_pred) / np.linalg.det(sigma_true))
+        kl = 0.5 * (quad_term + trace_term + det_term - n)
+        return pd.Series(kl, index=cols)
+    else:
+        raise AssertionError(
+            f"The parameter of the function wasserstein_distance should be one of"
+            f"the following: [`columnwise`, `gaussian`], not `{method}`!"
        )
 
 
 def _get_numerical_features(df1: pd.DataFrame) -> List[str]:
@@ -242,7 +236,7 @@ def _get_categorical_features(df1: pd.DataFrame) -> List[str]:
     return cols_categorical
 
 
-def _kolmogorov_smirnov_test(df1: pd.Series, df2: pd.Series) -> float:
+def kolmogorov_smirnov_test_1D(df1: pd.Series, df2: pd.Series) -> float:
     """Compute KS test statistic of the two-sample Kolmogorov-Smirnov test for goodness of fit.
     See more in https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ks_2samp.html.
 
@@ -283,11 +277,14 @@ def kolmogorov_smirnov_test(
     """
     cols_numerical = _get_numerical_features(df1)
     return columnwise_metric(
-        df1[cols_numerical], df2[cols_numerical], df_mask[cols_numerical], _kolmogorov_smirnov_test
+        df1[cols_numerical],
+        df2[cols_numerical],
+        df_mask[cols_numerical],
+        kolmogorov_smirnov_test_1D,
     )
 
 
-def _total_variance_distance(df1: pd.Series, df2: pd.Series) -> float:
+def total_variance_distance_1D(df1: pd.Series, df2: pd.Series) -> float:
     """Compute Total Variance Distance for a categorical feature
     It is based on TVComplement in https://github.com/sdv-dev/SDMetrics
 
@@ -337,7 +334,7 @@ def total_variance_distance(
         df1[cols_categorical],
         df2[cols_categorical],
         df_mask[cols_categorical],
-        _total_variance_distance,
+        total_variance_distance_1D,
     )
 
 
@@ -564,20 +561,20 @@ def mean_difference_correlation_matrix_categorical_vs_numerical_features(
 
 
 ###########################
-# Row-wise metris        #
+# Row-wise metrics       #
 ###########################
 
 
-def _sum_distance_col(col: pd.Series, col_size: int) -> float:
-    col = col.sort_values(ascending=True)
-    sums_partial = col.shift().fillna(0.0).cumsum()
-    differences_partial = col * np.arange(col_size) - sums_partial
+def _sum_manhattan_distances_1D(values: pd.Series) -> float:
+    values = values.sort_values(ascending=True)
+    sums_partial = values.shift().fillna(0.0).cumsum()
+    differences_partial = values * np.arange(len(values)) - sums_partial
     res = differences_partial.sum()
     return res
 
 
 def _sum_manhattan_distances(df1: pd.DataFrame) -> float:
-    """Sum Manhattan distances.
+    """Sum Manhattan distances between all pairs of rows.
     It is based on https://www.geeksforgeeks.org/sum-manhattan-distances-pairs-points/
 
     Parameters
@@ -586,10 +583,8 @@ def _sum_manhattan_distances(df1: pd.DataFrame) -> float:
         _description_
     """
     cols = df1.columns.tolist()
-    sum = 0.0
-    for col in cols:
-        sum += _sum_distance_col(df1[col], len(df1))
-    return sum
+    result = sum([_sum_manhattan_distances_1D(df1[col]) for col in cols])
+    return result
 
 
 def sum_energy_distances(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame) -> pd.Series:
@@ -613,9 +608,8 @@ def sum_energy_distances(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataF
     df1 = df1[df_mask].fillna(0.0)
     df2 = df2[df_mask].fillna(0.0)
 
-    sum_distances_df1 = _sum_manhattan_distances(
-        df1
-    )  # sum of (len_df1 * (len_df1 - 1) / 2) distances for df1
+    # sum of (len_df1 * (len_df1 - 1) / 2) distances for df1
+    sum_distances_df1 = _sum_manhattan_distances(df1)
    sum_distances_df2 = _sum_manhattan_distances(df2)
 
     df = pd.concat([df1, df2])
@@ -654,7 +648,7 @@ def sum_pairwise_distances(
 
 
 ###########################
-# Dataframe-wise metris  #
+# Dataframe-wise metrics #
 ###########################
 
 
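
The new `gaussian` branch of `kl_divergence` evaluates a closed-form divergence between two multivariate normal distributions fitted to the scaled data (the Wikipedia link in the docstring gives the derivation). A minimal sketch of the textbook closed form on toy inputs, assuming the distribution parameters are already estimated:

```python
# Closed-form KL(N(mu0, S0) || N(mu1, S1)) for k-dimensional Gaussians:
#   0.5 * (tr(S1^-1 S0) + (mu1 - mu0)^T S1^-1 (mu1 - mu0)
#          - k + ln(det S1 / det S0))
# The toy parameters below are illustrative only.
import numpy as np

def gaussian_kl(mu0, S0, mu1, S1) -> float:
    k = len(mu0)
    inv_S1 = np.linalg.inv(S1)
    diff = mu1 - mu0
    return 0.5 * (
        np.trace(inv_S1 @ S0)
        + diff @ inv_S1 @ diff
        - k
        + np.log(np.linalg.det(S1) / np.linalg.det(S0))
    )

mu0, S0 = np.zeros(2), np.eye(2)
mu1, S1 = np.array([0.5, 0.0]), np.diag([1.5, 0.8])
print(gaussian_kl(mu0, S0, mu0, S0))  # 0.0 for identical Gaussians
print(gaussian_kl(mu0, S0, mu1, S1))  # positive otherwise
```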

tests/benchmark/test_metrics.py

Lines changed: 16 additions & 24 deletions
@@ -2,10 +2,10 @@
 # # Evaluation metrics #
 # ######################
 
-import pandas as pd
 import numpy as np
-import scipy
+import pandas as pd
 import pytest
+import scipy
 
 from qolmat.benchmark import metrics
 
@@ -71,25 +71,19 @@ def test_weighted_mean_absolute_percentage_error(
     assert metrics.weighted_mean_absolute_percentage_error(df1, df1, df_mask).equals(
         pd.Series([0.0, 0.0], index=["col1", "col2"])
     )
-    assert (
-        metrics.weighted_mean_absolute_percentage_error(df1, df2, df_mask)
-        .round(3)
-        .equals(pd.Series([0.083, 1.167], index=["col1", "col2"]))
-    )
+    result = metrics.weighted_mean_absolute_percentage_error(df1, df2, df_mask)
+    expected = pd.Series([0.083, 1.167], index=["col1", "col2"])
+    np.testing.assert_allclose(result, expected, atol=1e-3)
 
 
 @pytest.mark.parametrize("df1", [df_incomplete])
 @pytest.mark.parametrize("df2", [df_imputed])
 @pytest.mark.parametrize("df_mask", [df_mask])
-def test_wasser_distance(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame) -> None:
-    assert metrics.wasser_distance(df1, df1, df_mask).equals(
-        pd.Series([0.0, 0.0], index=["col1", "col2"])
-    )
-    assert (
-        metrics.wasser_distance(df1, df2, df_mask)
-        .round(3)
-        .equals(pd.Series([0.250, 0.833], index=["col1", "col2"]))
-    )
+def test_wasserstein_distance(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame) -> None:
+    dist = metrics.wasserstein_distance(df1, df1, df_mask, method="columnwise")
+    assert dist.equals(pd.Series([0.0, 0.0], index=["col1", "col2"]))
+    dist = metrics.wasserstein_distance(df1, df2, df_mask, method="columnwise")
+    assert dist.round(3).equals(pd.Series([0.250, 0.833], index=["col1", "col2"]))
 
 
 @pytest.mark.parametrize("df1", [df_incomplete])
@@ -98,27 +92,25 @@ def test_wasser_distance(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataF
 def test_kl_divergence_columnwise(
     df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame
 ) -> None:
-    assert metrics.kl_divergence_columnwise(df1, df1, df_mask).equals(
+    assert metrics.kl_divergence(df1, df1, df_mask, method="columnwise").equals(
         pd.Series([0.0, 0.0], index=["col1", "col2"])
     )
-    assert (
-        metrics.kl_divergence_columnwise(df1, df2, df_mask)
-        .round(3)
-        .equals(pd.Series([18.945, 36.637], index=["col1", "col2"]))
-    )
+    result = metrics.kl_divergence(df1, df2, df_mask, method="columnwise")
+    expected = pd.Series([18.945, 36.637], index=["col1", "col2"])
+    np.testing.assert_allclose(result, expected, atol=1e-3)
 
 
 @pytest.mark.parametrize("df1", [df_incomplete])
 @pytest.mark.parametrize("df2", [df_imputed])
 @pytest.mark.parametrize("df_mask", [df_mask])
 def test_kl_divergence(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame) -> None:
     assert (
-        metrics.kl_divergence(df1, df1, df_mask)
+        metrics.kl_divergence(df1, df1, df_mask, method="gaussian")
         .round(2)
         .equals(pd.Series([-0.5, -0.5], index=["col1", "col2"]))
     )
     assert (
-        metrics.kl_divergence(df1, df2, df_mask)
+        metrics.kl_divergence(df1, df2, df_mask, method="gaussian")
         .round(3)
         .equals(pd.Series([0.263, 0.263], index=["col1", "col2"]))
     )

tests/imputations/test_imputers.py

Lines changed: 2 additions & 2 deletions
@@ -154,7 +154,7 @@ def test_ImputerResiduals_fit_transform(df: pd.DataFrame) -> None:
         },
         index=pd.date_range("2023-04-17", periods=20, freq="D"),
     )
-    np.testing.assert_allclose(result, expected, rtol=1e-6)
+    np.testing.assert_allclose(result, expected, atol=1e-6)
 
 
 @pytest.mark.parametrize("df", [df_incomplete])
@@ -228,4 +228,4 @@ def test_ImputerEM_fit_transform(df: pd.DataFrame) -> None:
            "col2": [0, 1.914706, 2, 2.480963, 2] + [i for i in range(5, 20)],
         }
     )
-    np.testing.assert_allclose(result, expected, rtol=1e-6)
+    np.testing.assert_allclose(result, expected, atol=1e-6)
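
The `rtol` to `atol` switch matters here because the expected frames contain exact zeros (e.g. `"col2": [0, ...]`), and a purely relative tolerance rejects any nonzero deviation from an expectation of 0.0. A small sketch of the difference:

```python
# assert_allclose allows |actual - desired| <= atol + rtol * |desired|, so
# with desired == 0.0 a relative-only tolerance allows no error at all.
import numpy as np

expected = np.array([0.0, 2.0])
result = np.array([1e-9, 2.0])

np.testing.assert_allclose(result, expected, atol=1e-6)  # passes
try:
    np.testing.assert_allclose(result, expected, rtol=1e-6)  # 1e-9 vs 0.0
except AssertionError:
    print("relative tolerance alone rejects tiny errors around 0.0")
```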
