
Commit 7fae887

Julien Roussel authored and committed
method kl forest removed, again
1 parent ecef346 commit 7fae887

File tree

3 files changed (+23, -108 lines)


examples/benchmark.md

Lines changed: 1 addition & 2 deletions
@@ -191,8 +191,7 @@ Concretely, the comparator takes as input a dataframe to impute, a proportion of
 Note these metrics compute reconstruction errors; it tells nothing about the distances between the "true" and "imputed" distributions.
 
 ```python
-metrics = ["mae", "wmape", "KL_columnwise", "KL_forest", "ks_test", "dist_corr_pattern"]
-# metrics = ["KL_forest"]
+metrics = ["mae", "wmape", "KL_columnwise", "ks_test", "dist_corr_pattern"]
 comparison = comparator.Comparator(
     dict_imputers,
     cols_to_impute,
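
Each metric name in this list is resolved through `get_metric` in `qolmat/benchmark/metrics.py` (see the diff below). A minimal sketch of what still resolves after this commit, assuming the package layout shown in this diff:

```python
# Sketch: metric-name lookup after this commit (assumes the qolmat layout
# shown in the metrics.py diff below; not taken from the example file).
from qolmat.benchmark import metrics

kl_col = metrics.get_metric("KL_columnwise")  # still registered
kl_gauss = metrics.get_metric("KL_gaussian")  # still registered
# metrics.get_metric("KL_forest") no longer resolves: its entry was
# removed from the metric dictionary in this commit.
```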

qolmat/benchmark/metrics.py

Lines changed: 14 additions & 93 deletions
@@ -19,7 +19,11 @@
 
 
 def columnwise_metric(
-    df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame, metric: Callable, **kwargs
+    df1: pd.DataFrame,
+    df2: pd.DataFrame,
+    df_mask: pd.DataFrame,
+    metric: Callable,
+    **kwargs,
 ) -> pd.Series:
     """For each column, compute a metric score based on the true dataframe
     and the predicted dataframe
@@ -171,7 +175,10 @@ def weighted_mean_absolute_percentage_error(
 
 
 def dist_wasserstein(
-    df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame, method: str = "columnwise"
+    df1: pd.DataFrame,
+    df2: pd.DataFrame,
+    df_mask: pd.DataFrame,
+    method: str = "columnwise",
 ) -> pd.Series:
     """Wasserstein distances between columns of 2 dataframes.
     Wasserstein distance can only be computed columnwise
@@ -651,7 +658,10 @@ def sum_energy_distances(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataF
 
 
 def sum_pairwise_distances(
-    df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame, metric: str = "cityblock"
+    df1: pd.DataFrame,
+    df2: pd.DataFrame,
+    df_mask: pd.DataFrame,
+    metric: str = "cityblock",
 ) -> float:
     """Sum of pairwise distances based on a predefined metric.
     Metrics are found in this link
@@ -766,50 +776,6 @@ def frechet_distance_pattern(
     return pd.Series(distance, index=["All"])
 
 
-def density_from_rf(
-    df: pd.DataFrame, estimator: BaseEnsemble, df_est: Optional[pd.DataFrame] = None
-):
-    """Estimates the density of the empirical distribution given by df at the sample points given
-    by df_est. The estimation uses an random forest estimator and relies on the average number of
-    samples in the leaf corresponding to each estimation point.
-
-    Disclaimer: this method is experimental and has no known theoretical grounds
-
-    Parameters
-    ----------
-    df : pd.DataFrame
-        Empirical distribution which density should be estimated
-    estimator : BaseEnsemble
-        Estimator defining the forest upon which is based the density counting.
-    df_est : pd.DataFrame, optional
-        Sample points of the estimation, by default None
-        If None, the density is estimated at the points given by `df`.
-
-    Returns
-    -------
-    pd.Series
-        Series of floats providing the normalized density
-    """
-    if df_est is None:
-        df_est = df.copy()
-    if df_est.index.names == [None]:
-        cols_index = ["index"]
-    else:
-        cols_index = df_est.index.names
-    counts = pd.Series(0, index=df_est.index)
-    df_leafs = pd.DataFrame(estimator.apply(df), index=df.index)
-    df_leafs_est = pd.DataFrame(estimator.apply(df_est), index=df_est.index)
-    for i_tree in range(estimator.n_estimators):
-        leafs = df_leafs[i_tree].rename("id_leaf")
-        leafs_est = df_leafs_est[i_tree].rename("id_leaf")
-        counts_leafs = leafs.value_counts().rename("count")
-        df_merge = pd.merge(leafs_est.reset_index(), counts_leafs.reset_index(), on="id_leaf")
-        df_merge = df_merge.set_index(cols_index)
-        counts += df_merge["count"]
-    counts /= counts.sum()
-    return counts
-
-
 def kl_divergence_1D(df1: pd.Series, df2: pd.Series) -> float:
     """Estimation of the Kullback-Leibler divergence between the two 1D empirical distributions
     given by `df1`and `df2`. The samples are binarized using a uniform spacing with 20 bins from
@@ -896,45 +862,6 @@ def kl_divergence_gaussian(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.Ser
     return div_kl
 
 
-def kl_divergence_forest(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame) -> float:
-    """Kullback-Leibler divergence estimation based on a random forest fitted on the first
-    empirical distribution
-
-    Disclaimer: this method is experimental and has no known theoretical grounds
-
-    Parameters
-    ----------
-    df1 : pd.DataFrame
-        First empirical distribution
-    df2 : pd.DataFrame
-        Second empirical distribution
-    df_mask: pd.DataFrame
-        Mask indicating on what values the divergence should be computed
-
-    Returns
-    -------
-    pd.Series
-        Series of estimated metrics
-    """
-    df1 = df1[df_mask.any(axis=1)]
-    df2 = df2[df_mask.any(axis=1)]
-    # df_1 = StandardScaler().fit_transform(df1[df_mask.any(axis=1)])
-    # df_2 = StandardScaler().fit_transform(df2[df_mask.any(axis=1)])
-    n_estimators = 100
-    # estimator = sklearn.ensemble.RandomForestClassifier(
-    #     n_estimators=n_estimators, max_depth=10
-    # )
-    # X = pd.concat([df1, df2])
-    # y = pd.concat([pd.Series([False] * len(df1)), pd.Series([True] * len(df2))])
-    # estimator.fit(X, y)
-    estimator = sklearn.ensemble.RandomTreesEmbedding(n_estimators=n_estimators, random_state=123)
-    estimator.fit(df1)
-    counts1 = density_from_rf(df1, estimator, df_est=df2)
-    counts2 = density_from_rf(df2, estimator, df_est=df2)
-    div_kl = np.mean(np.log(counts1 / counts2) * counts1 / counts2)
-    return div_kl
-
-
 def kl_divergence(
     df1: pd.DataFrame,
     df2: pd.DataFrame,
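
A note on the deleted estimator: writing `counts1` for the leaf-count density p of `df1` and `counts2` for the density q of `df2`, both evaluated at points sampled from `df2` (that is, from Q), the removed line `np.mean(np.log(counts1 / counts2) * counts1 / counts2)` reads as a change-of-measure estimate of KL(P‖Q):

```latex
\mathbb{E}_{x \sim Q}\!\left[ \frac{p(x)}{q(x)} \log \frac{p(x)}{q(x)} \right]
  = \int q(x) \, \frac{p(x)}{q(x)} \, \log \frac{p(x)}{q(x)} \, dx
  = \int p(x) \, \log \frac{p(x)}{q(x)} \, dx
  = \mathrm{KL}(P \,\|\, Q)
```

The identity holds for exact densities; the leaf-count estimates carry no such guarantee, which is what the docstring's disclaimer flags.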
@@ -948,7 +875,6 @@ def kl_divergence(
     - columnwise, relying on a uniform binarization and only taking marginals into account
     (https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence),
     - gaussian, relying on a Gaussian approximation,
-    - random_forest, experimental
 
     Parameters
     ----------
@@ -991,14 +917,10 @@
             kl_divergence_gaussian,
             min_n_rows=min_n_rows,
         )
-    elif method == "random_forest":
-        return pattern_based_weighted_mean_metric(
-            df1, df2, df_mask, kl_divergence_forest, min_n_rows=min_n_rows
-        )
     else:
         raise AssertionError(
             f"The parameter of the function wasserstein_distance should be one of"
-            f"the following: [`columnwise`, `gaussian`, `random_forest`], not `{method}`!"
+            f"the following: [`columnwise`, `gaussian`], not `{method}`!"
         )
 
 
@@ -1086,7 +1008,6 @@ def get_metric(name: str) -> Callable:
         "wasserstein_columnwise": dist_wasserstein,
         "KL_columnwise": partial(kl_divergence, method="columnwise"),
         "KL_gaussian": partial(kl_divergence, method="gaussian"),
-        "KL_forest": partial(kl_divergence, method="random_forest"),
         "ks_test": kolmogorov_smirnov_test,
        "correlation_diff": mean_difference_correlation_matrix_numerical_features,
         "energy": sum_energy_distances,

tests/benchmark/test_metrics.py

Lines changed: 8 additions & 13 deletions
@@ -20,7 +20,10 @@
 df_imputed = pd.DataFrame({"col1": [0, 1, 2, 3.5, 4], "col2": [-1.5, 0, 1.5, 2, 1.5]})
 
 df_mask = pd.DataFrame(
-    {"col1": [False, False, True, True, False], "col2": [True, False, True, True, False]}
+    {
+        "col1": [False, False, True, True, False],
+        "col2": [True, False, True, True, False],
+    }
 )
 
 
@@ -131,17 +134,6 @@ def test_kl_divergence_gaussian(
     np.testing.assert_allclose(result, 1.371, atol=1e-3)
 
 
-@pytest.mark.parametrize("df1", [df_incomplete])
-@pytest.mark.parametrize("df2", [df_imputed])
-@pytest.mark.parametrize("df_mask", [df_mask])
-def test_kl_divergence_forest(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame) -> None:
-    result = metrics.kl_divergence_forest(df1, df1, df_mask)
-    np.testing.assert_allclose(result, 0, atol=1e-3)
-
-    result = metrics.kl_divergence_forest(df1, df2, df_mask)
-    np.testing.assert_allclose(result, 6.21e-2, rtol=1e-2)
-
-
 @pytest.mark.parametrize("df1", [df_incomplete])
 @pytest.mark.parametrize("df2", [df_imputed])
 @pytest.mark.parametrize("df_mask", [df_mask])
@@ -230,7 +222,10 @@ def test_mean_difference_correlation_matrix_numerical_features(
     )
 
 df_mask_cat = pd.DataFrame(
-    {"col1": [False, False, True, True, False], "col2": [True, False, True, True, False]}
+    {
+        "col1": [False, False, True, True, False],
+        "col2": [True, False, True, True, False],
+    }
 )
 
 
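
To check that the remaining divergence tests still pass after this removal, one option (assuming pytest is installed and the command is run from the repository root) is:

```python
# Hypothetical invocation: run only the KL-related tests that remain.
import pytest

pytest.main(["tests/benchmark/test_metrics.py", "-k", "kl_divergence"])
```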
