 
 
 def columnwise_metric(
-    df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame, metric: Callable, **kwargs
+    df1: pd.DataFrame,
+    df2: pd.DataFrame,
+    df_mask: pd.DataFrame,
+    metric: Callable,
+    **kwargs,
 ) -> pd.Series:
     """For each column, compute a metric score based on the true dataframe
     and the predicted dataframe
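A minimal usage sketch for this signature, assuming `columnwise_metric` applies `metric` to the masked entries of each column (the toy frames and the `mean_squared_error` choice are illustrative, not from this diff):

```python
import pandas as pd
from sklearn.metrics import mean_squared_error

df1 = pd.DataFrame({"a": [1.0, 2.0, 3.0], "b": [4.0, 5.0, 6.0]})  # ground truth
df2 = pd.DataFrame({"a": [1.0, 2.5, 3.0], "b": [4.0, 5.0, 7.0]})  # predictions
df_mask = pd.DataFrame({"a": [False, True, False], "b": [False, False, True]})

scores = columnwise_metric(df1, df2, df_mask, mean_squared_error)
# Result: a Series indexed by column; under the masking assumption above,
# a -> 0.25 and b -> 1.0.
```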
@@ -171,7 +175,10 @@ def weighted_mean_absolute_percentage_error(
 
 
 def dist_wasserstein(
-    df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame, method: str = "columnwise"
+    df1: pd.DataFrame,
+    df2: pd.DataFrame,
+    df_mask: pd.DataFrame,
+    method: str = "columnwise",
 ) -> pd.Series:
     """Wasserstein distances between columns of 2 dataframes.
     Wasserstein distance can only be computed columnwise
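For the `columnwise` method, the natural building block is SciPy's 1D distance; a sketch, assuming each column is compared on its masked entries (the masking scheme is an assumption, not shown in this hunk):

```python
import pandas as pd
from scipy.stats import wasserstein_distance

distances = pd.Series(
    {
        col: wasserstein_distance(df1[col][df_mask[col]], df2[col][df_mask[col]])
        for col in df1.columns
    }
)
```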
@@ -651,7 +658,10 @@ def sum_energy_distances(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataF
 
 
 def sum_pairwise_distances(
-    df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame, metric: str = "cityblock"
+    df1: pd.DataFrame,
+    df2: pd.DataFrame,
+    df_mask: pd.DataFrame,
+    metric: str = "cityblock",
 ) -> float:
     """Sum of pairwise distances based on a predefined metric.
     Metrics are found in this link
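A sketch of what this computes, assuming the `metric` string is forwarded to SciPy's pairwise-distance routine and rows are filtered by the mask (both assumptions, not confirmed by this hunk):

```python
from scipy.spatial.distance import cdist

rows = df_mask.any(axis=1)
total = cdist(df1[rows], df2[rows], metric="cityblock").sum()
```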
@@ -766,50 +776,6 @@ def frechet_distance_pattern(
     return pd.Series(distance, index=["All"])
 
 
-def density_from_rf(
-    df: pd.DataFrame, estimator: BaseEnsemble, df_est: Optional[pd.DataFrame] = None
-):
-    """Estimate the density of the empirical distribution given by `df` at the sample points
-    given by `df_est`. The estimation uses a random forest estimator and relies on the average
-    number of samples in the leaf corresponding to each estimation point.
-
-    Disclaimer: this method is experimental and has no known theoretical grounds
-
-    Parameters
-    ----------
-    df : pd.DataFrame
-        Empirical distribution whose density should be estimated
-    estimator : BaseEnsemble
-        Estimator defining the forest on which the density counting is based.
-    df_est : pd.DataFrame, optional
-        Sample points of the estimation, by default None.
-        If None, the density is estimated at the points given by `df`.
-
-    Returns
-    -------
-    pd.Series
-        Series of floats providing the normalized density
-    """
-    if df_est is None:
-        df_est = df.copy()
-    if df_est.index.names == [None]:
-        cols_index = ["index"]
-    else:
-        cols_index = df_est.index.names
-    counts = pd.Series(0, index=df_est.index)
-    df_leafs = pd.DataFrame(estimator.apply(df), index=df.index)
-    df_leafs_est = pd.DataFrame(estimator.apply(df_est), index=df_est.index)
-    for i_tree in range(estimator.n_estimators):
-        leafs = df_leafs[i_tree].rename("id_leaf")
-        leafs_est = df_leafs_est[i_tree].rename("id_leaf")
-        counts_leafs = leafs.value_counts().rename("count")
-        df_merge = pd.merge(leafs_est.reset_index(), counts_leafs.reset_index(), on="id_leaf")
-        df_merge = df_merge.set_index(cols_index)
-        counts += df_merge["count"]
-    counts /= counts.sum()
-    return counts
-
-
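A hypothetical call to the removed helper, with a `RandomTreesEmbedding` standing in for the `BaseEnsemble` (this mirrors how `kl_divergence_forest` below used it):

```python
from sklearn.ensemble import RandomTreesEmbedding

forest = RandomTreesEmbedding(n_estimators=100, random_state=123).fit(df1)
density = density_from_rf(df1, forest, df_est=df2)  # normalized: sums to 1
```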
 def kl_divergence_1D(df1: pd.Series, df2: pd.Series) -> float:
     """Estimation of the Kullback-Leibler divergence between the two 1D empirical distributions
     given by `df1` and `df2`. The samples are binned using a uniform spacing with 20 bins from
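The docstring is cut off by the hunk, but it describes a binned estimate; a minimal sketch of that approach (the bin bounds and the zero-bin guard are assumptions, not the library's exact choices):

```python
import numpy as np
import pandas as pd

def kl_1d_sketch(s1: pd.Series, s2: pd.Series, n_bins: int = 20) -> float:
    bins = np.linspace(min(s1.min(), s2.min()), max(s1.max(), s2.max()), n_bins + 1)
    p, _ = np.histogram(s1, bins=bins)
    q, _ = np.histogram(s2, bins=bins)
    p = p / p.sum()
    q = q / q.sum()
    eps = 1e-12  # guard against empty bins
    return float(np.sum(p * np.log((p + eps) / (q + eps))))
```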
@@ -896,45 +862,6 @@ def kl_divergence_gaussian(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.Ser
     return div_kl
 
 
-def kl_divergence_forest(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame) -> float:
-    """Kullback-Leibler divergence estimation based on a random forest fitted on the first
-    empirical distribution
-
-    Disclaimer: this method is experimental and has no known theoretical grounds
-
-    Parameters
-    ----------
-    df1 : pd.DataFrame
-        First empirical distribution
-    df2 : pd.DataFrame
-        Second empirical distribution
-    df_mask : pd.DataFrame
-        Mask indicating on which values the divergence should be computed
-
-    Returns
-    -------
-    float
-        Estimated Kullback-Leibler divergence
-    """
-    df1 = df1[df_mask.any(axis=1)]
-    df2 = df2[df_mask.any(axis=1)]
-    # df_1 = StandardScaler().fit_transform(df1[df_mask.any(axis=1)])
-    # df_2 = StandardScaler().fit_transform(df2[df_mask.any(axis=1)])
-    n_estimators = 100
-    # estimator = sklearn.ensemble.RandomForestClassifier(
-    #     n_estimators=n_estimators, max_depth=10
-    # )
-    # X = pd.concat([df1, df2])
-    # y = pd.concat([pd.Series([False] * len(df1)), pd.Series([True] * len(df2))])
-    # estimator.fit(X, y)
-    estimator = sklearn.ensemble.RandomTreesEmbedding(n_estimators=n_estimators, random_state=123)
-    estimator.fit(df1)
-    counts1 = density_from_rf(df1, estimator, df_est=df2)
-    counts2 = density_from_rf(df2, estimator, df_est=df2)
-    div_kl = np.mean(np.log(counts1 / counts2) * counts1 / counts2)
-    return div_kl
-
-
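Why the removed estimator's last lines approximate a KL divergence: the rows of `df2` are treated as draws from the second distribution q, so averaging (p/q)·log(p/q) over them is a Monte-Carlo estimate of

E_q[(p/q)·log(p/q)] = ∫ q·(p/q)·log(p/q) dx = ∫ p·log(p/q) dx = KL(p ‖ q),

with p and q replaced by the leaf-count densities `counts1` and `counts2`.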
 def kl_divergence(
     df1: pd.DataFrame,
     df2: pd.DataFrame,
@@ -948,7 +875,6 @@ def kl_divergence(
     - columnwise, relying on a uniform binning and only taking marginals into account
       (https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence),
     - gaussian, relying on a Gaussian approximation,
-    - random_forest, experimental
 
     Parameters
     ----------
@@ -991,14 +917,10 @@ def kl_divergence(
             kl_divergence_gaussian,
             min_n_rows=min_n_rows,
         )
-    elif method == "random_forest":
-        return pattern_based_weighted_mean_metric(
-            df1, df2, df_mask, kl_divergence_forest, min_n_rows=min_n_rows
-        )
     else:
         raise AssertionError(
             f"The parameter of the function kl_divergence should be one of "
-            f"the following: [`columnwise`, `gaussian`, `random_forest`], not `{method}`!"
+            f"the following: [`columnwise`, `gaussian`], not `{method}`!"
         )
 
 
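For reference on the `gaussian` branch: fitting N(μ1, Σ1) and N(μ2, Σ2) to the two d-dimensional samples, the standard closed form (presumably what `kl_divergence_gaussian` evaluates) is

KL = ½ [ tr(Σ2⁻¹ Σ1) + (μ2 − μ1)ᵀ Σ2⁻¹ (μ2 − μ1) − d + ln(det Σ2 / det Σ1) ],

and a hypothetical call through the dispatcher is:

```python
div = kl_divergence(df1, df2, df_mask, method="gaussian")
```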
@@ -1086,7 +1008,6 @@ def get_metric(name: str) -> Callable:
         "wasserstein_columnwise": dist_wasserstein,
         "KL_columnwise": partial(kl_divergence, method="columnwise"),
         "KL_gaussian": partial(kl_divergence, method="gaussian"),
-        "KL_forest": partial(kl_divergence, method="random_forest"),
         "ks_test": kolmogorov_smirnov_test,
         "correlation_diff": mean_difference_correlation_matrix_numerical_features,
         "energy": sum_energy_distances,
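A hypothetical lookup through this registry:

```python
metric_fun = get_metric("KL_gaussian")
scores = metric_fun(df1, df2, df_mask)
```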