33import numpy as np
44import pandas as pd
55import scipy
6-
76from sklearn import metrics as skm
87from sklearn .preprocessing import StandardScaler
98
@@ -105,7 +104,9 @@ def weighted_mean_absolute_percentage_error(
105104 return columnwise_metric (df1 , df2 , df_mask , skm .mean_absolute_percentage_error )
106105
107106
def wasserstein_distance(
    df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame, method: str = "columnwise"
) -> pd.Series:
    """Wasserstein distances between the columns of two dataframes.

    The Wasserstein distance can only be computed columnwise.

    Parameters
    ----------
    df1 : pd.DataFrame
        First dataframe (presumably the true values — confirm against callers).
    df2 : pd.DataFrame
        Second dataframe (presumably the imputed/predicted values).
    df_mask : pd.DataFrame
        Boolean mask selecting the entries on which the metric is computed.
    method : str
        Only "columnwise" is supported: the distance is computed
        independently for each column. Defaults to "columnwise".

    Returns
    -------
    pd.Series
        Wasserstein distance per column.

    Raises
    ------
    ValueError
        If `method` is not "columnwise".
    """
    if method == "columnwise":
        return columnwise_metric(df1, df2, df_mask, scipy.stats.wasserstein_distance)
    # ValueError (not AssertionError) is the conventional exception for a
    # bad argument value; the message parts are joined with a space.
    raise ValueError(
        "The `method` argument of wasserstein_distance should be one of "
        f"the following: [`columnwise`], not `{method}`!"
    )
126133
127134
128135def kl_divergence_1D (df1 : pd .Series , df2 : pd .Series ) -> np .number :
@@ -134,30 +141,9 @@ def kl_divergence_1D(df1: pd.Series, df2: pd.Series) -> np.number:
134141 return scipy .stats .entropy (p + EPS , q + EPS )
135142
136143
def kl_divergence(
    df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame, method: str = "columnwise"
) -> pd.Series:
    """Kullback-Leibler divergence between the distributions of two dataframes.

    For multivariate normal distributions the closed form is used, see
    https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence

    Parameters
    ----------
    df1 : pd.DataFrame
        First dataframe (presumably the true values — confirm against callers).
    df2 : pd.DataFrame
        Second dataframe (presumably the imputed/predicted values).
    df_mask : pd.DataFrame
        Boolean mask selecting the entries on which the metric is computed.
    method : str
        "columnwise": one empirical 1D divergence per column.
        "gaussian": closed-form divergence between multivariate normal fits
        of the two dataframes, repeated once per column in the result.

    Returns
    -------
    pd.Series
        Kullback-Leibler divergence, indexed by column.

    Raises
    ------
    ValueError
        If `method` is neither "columnwise" nor "gaussian".
    """
    if method == "columnwise":
        return columnwise_metric(df1, df2, df_mask, kl_divergence_1D)
    if method == "gaussian":
        cols = df1.columns.tolist()
        # Keep only rows where at least one entry is evaluated, then rescale
        # each feature to zero mean / unit variance.
        df_1 = StandardScaler().fit_transform(df1[df_mask.any(axis=1)])
        df_2 = StandardScaler().fit_transform(df2[df_mask.any(axis=1)])

        # Gaussian moments of each dataframe, robust to NaNs.
        mu_true = np.nanmean(df_1, axis=0)
        sigma_true = np.ma.cov(np.ma.masked_invalid(df_1), rowvar=False).data
        mu_pred = np.nanmean(df_2, axis=0)
        sigma_pred = np.ma.cov(np.ma.masked_invalid(df_2), rowvar=False).data

        diff = mu_true - mu_pred
        inv_sigma_pred = np.linalg.inv(sigma_pred)
        quad_term = diff.T @ inv_sigma_pred @ diff
        trace_term = np.trace(inv_sigma_pred @ sigma_true)
        # slogdet is numerically stabler than log(det/det) for large or
        # ill-conditioned covariance matrices.
        _, logdet_pred = np.linalg.slogdet(sigma_pred)
        _, logdet_true = np.linalg.slogdet(sigma_true)
        det_term = logdet_pred - logdet_true
        # Closed-form KL(N_true || N_pred): the subtracted term is the
        # dimension d (number of features), not the sample count.
        d = len(cols)
        kl = 0.5 * (quad_term + trace_term + det_term - d)
        return pd.Series(kl, index=cols)
    # ValueError (not AssertionError), correct function name in the message,
    # and the message parts joined with a space.
    raise ValueError(
        "The `method` argument of kl_divergence should be one of "
        f"the following: [`columnwise`, `gaussian`], not `{method}`!"
    )
193187
194188
195189def _get_numerical_features (df1 : pd .DataFrame ) -> List [str ]:
@@ -242,7 +236,7 @@ def _get_categorical_features(df1: pd.DataFrame) -> List[str]:
242236 return cols_categorical
243237
244238
245- def _kolmogorov_smirnov_test (df1 : pd .Series , df2 : pd .Series ) -> float :
239+ def kolmogorov_smirnov_test_1D (df1 : pd .Series , df2 : pd .Series ) -> float :
246240 """Compute KS test statistic of the two-sample Kolmogorov-Smirnov test for goodness of fit.
247241 See more in https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ks_2samp.html.
248242
@@ -283,11 +277,14 @@ def kolmogorov_smirnov_test(
283277 """
284278 cols_numerical = _get_numerical_features (df1 )
285279 return columnwise_metric (
286- df1 [cols_numerical ], df2 [cols_numerical ], df_mask [cols_numerical ], _kolmogorov_smirnov_test
280+ df1 [cols_numerical ],
281+ df2 [cols_numerical ],
282+ df_mask [cols_numerical ],
283+ kolmogorov_smirnov_test_1D ,
287284 )
288285
289286
290- def _total_variance_distance (df1 : pd .Series , df2 : pd .Series ) -> float :
287+ def total_variance_distance_1D (df1 : pd .Series , df2 : pd .Series ) -> float :
291288 """Compute Total Variance Distance for a categorical feature
292289 It is based on TVComplement in https://github.com/sdv-dev/SDMetrics
293290
@@ -337,7 +334,7 @@ def total_variance_distance(
337334 df1 [cols_categorical ],
338335 df2 [cols_categorical ],
339336 df_mask [cols_categorical ],
340- _total_variance_distance ,
337+ total_variance_distance_1D ,
341338 )
342339
343340
@@ -564,20 +561,20 @@ def mean_difference_correlation_matrix_categorical_vs_numerical_features(
564561
565562
566563###########################
567- # Row-wise metris #
564+ # Row-wise metrics #
568565###########################
569566
570567
571- def _sum_distance_col ( col : pd .Series , col_size : int ) -> float :
572- col = col .sort_values (ascending = True )
573- sums_partial = col .shift ().fillna (0.0 ).cumsum ()
574- differences_partial = col * np .arange (col_size ) - sums_partial
568+ def _sum_manhattan_distances_1D ( values : pd .Series ) -> float :
569+ values = values .sort_values (ascending = True )
570+ sums_partial = values .shift ().fillna (0.0 ).cumsum ()
571+ differences_partial = values * np .arange (len ( values ) ) - sums_partial
575572 res = differences_partial .sum ()
576573 return res
577574
578575
def _sum_manhattan_distances(df1: pd.DataFrame) -> float:
    """Sum of Manhattan distances between all pairs of rows of `df1`.

    The L1 distance is separable across coordinates, so the pairwise sum is
    computed one column at a time in O(n log n) per column.
    Based on https://www.geeksforgeeks.org/sum-manhattan-distances-pairs-points/

    Parameters
    ----------
    df1 : pd.DataFrame
        Numeric dataframe whose rows are the points.

    Returns
    -------
    float
        Sum over all row pairs of the Manhattan (L1) distance.
    """
    # Generator avoids materializing a throwaway list inside sum().
    return sum(_sum_manhattan_distances_1D(df1[col]) for col in df1.columns)
593588
594589
595590def sum_energy_distances (df1 : pd .DataFrame , df2 : pd .DataFrame , df_mask : pd .DataFrame ) -> pd .Series :
@@ -613,9 +608,8 @@ def sum_energy_distances(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataF
613608 df1 = df1 [df_mask ].fillna (0.0 )
614609 df2 = df2 [df_mask ].fillna (0.0 )
615610
616- sum_distances_df1 = _sum_manhattan_distances (
617- df1
618- ) # sum of (len_df1 * (len_df1 - 1) / 2) distances for df1
611+ # sum of (len_df1 * (len_df1 - 1) / 2) distances for df1
612+ sum_distances_df1 = _sum_manhattan_distances (df1 )
619613 sum_distances_df2 = _sum_manhattan_distances (df2 )
620614
621615 df = pd .concat ([df1 , df2 ])
@@ -654,7 +648,7 @@ def sum_pairwise_distances(
654648
655649
656650###########################
657- # Dataframe-wise metris #
651+ # Dataframe-wise metrics #
658652###########################
659653
660654
0 commit comments