@@ -125,71 +125,71 @@ def wasser_distance(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame)
     return columnwise_metric(df1, df2, df_mask, scipy.stats.wasserstein_distance)
 
 
-def kl_divergence_1D(df1: pd.Series, df2: pd.Series) -> np.number:
-    min_val = min(df1.min(), df2.min())
-    max_val = max(df1.max(), df2.max())
-    bins = np.linspace(min_val, max_val, 20)
-    p = np.histogram(df1, bins=bins, density=True)[0]
-    q = np.histogram(df2, bins=bins, density=True)[0]
-    return scipy.stats.entropy(p + EPS, q + EPS)
-
-
-def kl_divergence_columnwise(
-    df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame
-) -> pd.Series:
-    """TODO documentation
-    Kullback-Leibler divergence between distributions
-    If multivariate normal distributions:
-    https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
-
-    Parameters
-    ----------
-    df1 : pd.DataFrame
-    df2 : pd.DataFrame
-    columnwise_evaluation: Optional[bool]
-        if the evaluation is computed column-wise. By default, is set to False
-
-    Returns
-    -------
-    Kullback-Leibler divergence : Union[float, pd.Series]
-    """
-
-    return columnwise_metric(df1, df2, df_mask, kl_divergence_1D)
-
-
-def kl_divergence(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame) -> pd.Series:
-    """TODO Documentation
-    Kullback-Leibler divergence between distributions
-    If multivariate normal distributions:
-    https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
-
-    Parameters
-    ----------
-    df1 : pd.DataFrame
-    df2 : pd.DataFrame
-    columnwise_evaluation: Optional[bool]
-        if the evaluation is computed column-wise. By default, is set to False
-
-    Returns
-    -------
-    Kullback-Leibler divergence : Union[float, pd.Series]
-    """
-    cols = df1.columns.tolist()
-    df_1 = StandardScaler().fit_transform(df1[df_mask.any(axis=1)])
-    df_2 = StandardScaler().fit_transform(df2[df_mask.any(axis=1)])
-
-    n = df_1.shape[0]
-    mu_true = np.nanmean(df_1, axis=0)
-    sigma_true = np.ma.cov(np.ma.masked_invalid(df_1), rowvar=False).data
-    mu_pred = np.nanmean(df_2, axis=0)
-    sigma_pred = np.ma.cov(np.ma.masked_invalid(df_2), rowvar=False).data
-    diff = mu_true - mu_pred
-    inv_sigma_pred = np.linalg.inv(sigma_pred)
-    quad_term = diff.T @ inv_sigma_pred @ diff
-    trace_term = np.trace(inv_sigma_pred @ sigma_true)
-    det_term = np.log(np.linalg.det(sigma_pred) / np.linalg.det(sigma_true))
-    kl = 0.5 * (quad_term + trace_term + det_term - n)
-    return pd.Series(kl, index=cols)
+# def kl_divergence_1D(df1: pd.Series, df2: pd.Series) -> np.number:
+#     min_val = min(df1.min(), df2.min())
+#     max_val = max(df1.max(), df2.max())
+#     bins = np.linspace(min_val, max_val, 20)
+#     p = np.histogram(df1, bins=bins, density=True)[0]
+#     q = np.histogram(df2, bins=bins, density=True)[0]
+#     return scipy.stats.entropy(p + EPS, q + EPS)
+
+
+# def kl_divergence_columnwise(
+#     df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame
+# ) -> pd.Series:
+#     """TODO documentation
+#     Kullback-Leibler divergence between distributions
+#     If multivariate normal distributions:
+#     https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
+
+#     Parameters
+#     ----------
+#     df1 : pd.DataFrame
+#     df2 : pd.DataFrame
+#     columnwise_evaluation: Optional[bool]
+#         if the evaluation is computed column-wise. By default, is set to False
+
+#     Returns
+#     -------
+#     Kullback-Leibler divergence : Union[float, pd.Series]
+#     """
+
+#     return columnwise_metric(df1, df2, df_mask, kl_divergence_1D)
+
+
+# def kl_divergence(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame) -> pd.Series:
+#     """TODO Documentation
+#     Kullback-Leibler divergence between distributions
+#     If multivariate normal distributions:
+#     https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
+
+#     Parameters
+#     ----------
+#     df1 : pd.DataFrame
+#     df2 : pd.DataFrame
+#     columnwise_evaluation: Optional[bool]
+#         if the evaluation is computed column-wise. By default, is set to False
+
+#     Returns
+#     -------
+#     Kullback-Leibler divergence : Union[float, pd.Series]
+#     """
+#     cols = df1.columns.tolist()
+#     df_1 = StandardScaler().fit_transform(df1[df_mask.any(axis=1)])
+#     df_2 = StandardScaler().fit_transform(df2[df_mask.any(axis=1)])

+#     n = df_1.shape[0]
+#     mu_true = np.nanmean(df_1, axis=0)
+#     sigma_true = np.ma.cov(np.ma.masked_invalid(df_1), rowvar=False).data
+#     mu_pred = np.nanmean(df_2, axis=0)
+#     sigma_pred = np.ma.cov(np.ma.masked_invalid(df_2), rowvar=False).data
+#     diff = mu_true - mu_pred
+#     inv_sigma_pred = np.linalg.inv(sigma_pred)
+#     quad_term = diff.T @ inv_sigma_pred @ diff
+#     trace_term = np.trace(inv_sigma_pred @ sigma_true)
+#     det_term = np.log(np.linalg.det(sigma_pred) / np.linalg.det(sigma_true))
+#     kl = 0.5 * (quad_term + trace_term + det_term - n)
+#     return pd.Series(kl, index=cols)
 
 
 def _get_numerical_features(df1: pd.DataFrame) -> List[str]:
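For reference, the commented-out `kl_divergence` follows the closed-form KL divergence between two multivariate Gaussians from the linked Wikipedia article. A minimal sketch of that closed form (the name `kl_divergence_gaussian` and its argument names are illustrative, not part of the library): note that the final term subtracts the dimension `d` of the distributions, whereas the removed code subtracted the sample count `n = df_1.shape[0]`.

```python
import numpy as np

def kl_divergence_gaussian(mu_p, sigma_p, mu_q, sigma_q):
    """Closed-form KL(N(mu_p, sigma_p) || N(mu_q, sigma_q)) for d-dimensional Gaussians."""
    d = mu_p.shape[0]  # dimension of the distributions, not the number of samples
    inv_sigma_q = np.linalg.inv(sigma_q)
    diff = mu_q - mu_p
    quad_term = diff.T @ inv_sigma_q @ diff
    trace_term = np.trace(inv_sigma_q @ sigma_p)
    det_term = np.log(np.linalg.det(sigma_q) / np.linalg.det(sigma_p))
    return 0.5 * (quad_term + trace_term + det_term - d)
```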
@@ -341,6 +341,11 @@ def total_variance_distance(
     )
 
 
+def _check_same_number_columns(df1: pd.DataFrame, df2: pd.DataFrame):
+    if len(df1.columns) != len(df2.columns):
+        raise Exception("inputs have to have the same number of columns.")
+
+
 def _get_correlation_pearson_matrix(df: pd.DataFrame, use_p_value: bool = True) -> pd.DataFrame:
     """Get matrix of correlation values for numerical features
     based on Pearson correlation coefficient or p-value for testing non-correlation.
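The new `_check_same_number_columns` helper consolidates a guard that was previously duplicated across the three `mean_difference_correlation_matrix_*` functions patched below. A quick illustrative use (the toy frames are hypothetical):

```python
import pandas as pd

df_a = pd.DataFrame({"x": [1, 2], "y": [3, 4]})
df_b = pd.DataFrame({"x": [1, 2]})

_check_same_number_columns(df_a, df_a)  # same width: returns silently
# _check_same_number_columns(df_a, df_b)  # raises: same number of columns required
```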
@@ -400,8 +405,7 @@ def mean_difference_correlation_matrix_numerical_features(
     df1 = df1[df_mask].dropna(axis=0)
     df2 = df2[df_mask].dropna(axis=0)
 
-    if len(df1.columns) != len(df2.columns):
-        raise Exception("inputs have to have the same number of columns.")
+    _check_same_number_columns(df1, df2)
 
     cols_numerical = _get_numerical_features(df1)
     df_corr1 = _get_correlation_pearson_matrix(df1[cols_numerical], use_p_value=use_p_value)
@@ -470,8 +474,7 @@ def mean_difference_correlation_matrix_categorical_features(
     df1 = df1[df_mask].dropna(axis=0)
     df2 = df2[df_mask].dropna(axis=0)
 
-    if len(df1.columns) != len(df2.columns):
-        raise Exception("inputs have to have the same number of columns.")
+    _check_same_number_columns(df1, df2)
 
     cols_categorical = _get_categorical_features(df1)
     df_corr1 = _get_correlation_chi2_matrix(df1[cols_categorical], use_p_value=use_p_value)
@@ -510,14 +513,11 @@ def _get_correlation_f_oneway_matrix(
     for idx_cat, col_cat in enumerate(cols_categorical):
         for idx_num, col_num in enumerate(cols_numerical):
             category_group_lists = df.groupby(col_cat)[col_num].apply(list)
-            try:
-                res = scipy.stats.f_oneway(*category_group_lists)
-                if use_p_value:
-                    matrix[idx_cat, idx_num] = res[1]
-                else:
-                    matrix[idx_cat, idx_num] = res[0]
-            except ValueError:
-                matrix[idx_cat, idx_num] = 0.0
+            res = scipy.stats.f_oneway(*category_group_lists)
+            if use_p_value:
+                matrix[idx_cat, idx_num] = res[1]
+            else:
+                matrix[idx_cat, idx_num] = res[0]
     return pd.DataFrame(matrix, index=cols_categorical, columns=cols_numerical)
 
 
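Dropping the `try`/`except ValueError` means an error raised by `scipy.stats.f_oneway` on degenerate groupings now propagates to the caller instead of being silently recorded as `0.0`. For context, a small sketch of what each matrix cell computes (toy data; the column names are hypothetical):

```python
import pandas as pd
import scipy.stats

df = pd.DataFrame({
    "cat": ["a", "a", "b", "b", "c", "c"],
    "num": [1.0, 2.0, 2.5, 3.5, 5.0, 6.0],
})

# One list of numerical values per category, as in the loop above
groups = df.groupby("cat")["num"].apply(list)
statistic, p_value = scipy.stats.f_oneway(*groups)  # one-way ANOVA F test
```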
@@ -549,8 +549,7 @@ def mean_difference_correlation_matrix_categorical_vs_numerical_features(
     df1 = df1[df_mask].dropna(axis=0)
     df2 = df2[df_mask].dropna(axis=0)
 
-    if len(df1.columns) != len(df2.columns):
-        raise Exception("inputs have to have the same number of columns.")
+    _check_same_number_columns(df1, df2)
 
     cols_categorical = _get_categorical_features(df1)
     cols_numerical = _get_numerical_features(df1)
@@ -664,7 +663,7 @@ def frechet_distance(
     df2: pd.DataFrame,
     df_mask: pd.DataFrame,
     normalized: Optional[bool] = False,
-) -> float:
+) -> pd.Series:
     """Compute the Fréchet distance between two dataframes df1 and df2
     frechet_distance = || mu_1 - mu_2 ||_2^2 + Tr(Sigma_1 + Sigma_2 - 2(Sigma_1 . Sigma_2)^(1/2))
     if normalized, df1 and df2 are first scaled by a factor
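For reference, the docstring's formula is the squared 2-Wasserstein (Fréchet) distance between two Gaussians. A minimal sketch under that reading (the name `frechet_gaussian` is hypothetical; `scipy.linalg.sqrtm` supplies the matrix square root):

```python
import numpy as np
import scipy.linalg

def frechet_gaussian(mu_1, sigma_1, mu_2, sigma_2):
    """|| mu_1 - mu_2 ||_2^2 + Tr(Sigma_1 + Sigma_2 - 2 (Sigma_1 . Sigma_2)^(1/2))."""
    ssd = np.sum((mu_1 - mu_2) ** 2)
    covmean = scipy.linalg.sqrtm(sigma_1 @ sigma_2)
    if np.iscomplexobj(covmean):
        covmean = covmean.real  # sqrtm can return negligible imaginary parts
    return ssd + np.trace(sigma_1 + sigma_2 - 2.0 * covmean)
```

As written, this formula yields a single scalar, which is worth keeping in mind next to the hunk's `float` to `pd.Series` annotation change.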