@@ -801,6 +801,44 @@ def kl_divergence_1D(df1: pd.Series, df2: pd.Series) -> float:
     return scipy.stats.entropy(p + EPS, q + EPS)
 
 
+def kl_divergence_gaussian_exact(
+    mean1: pd.Series, cov1: pd.DataFrame, mean2: pd.Series, cov2: pd.DataFrame
+) -> float:
+    """Exact Kullback-Leibler divergence between two multivariate normal distributions.
+
+    Parameters
+    ----------
+    mean1: pd.Series
+        Mean of the first distribution
+    cov1: pd.DataFrame
+        Covariance matrix of the first distribution
+    mean2: pd.Series
+        Mean of the second distribution
+    cov2: pd.DataFrame
+        Covariance matrix of the second distribution
+
+    Returns
+    -------
+    float
+        Kullback-Leibler divergence
+    """
+    n_variables = len(mean1)
+    # Lower-triangular Cholesky factors, cov = L @ L.T (cholesky zeroes the
+    # unused triangle, unlike cho_factor, so triangular solves are safe)
+    L1 = scipy.linalg.cholesky(cov1, lower=True)
+    L2 = scipy.linalg.cholesky(cov2, lower=True)
+    M = scipy.linalg.solve_triangular(L2, L1, lower=True)
+    y = scipy.linalg.solve_triangular(L2, mean2 - mean1, lower=True)
+    # ||M||_F^2 = tr(cov2^-1 @ cov1)
+    norm_M = (M ** 2).sum()
+    # ||y||^2 = (mean2 - mean1)^T cov2^-1 (mean2 - mean1)
+    norm_y = (y ** 2).sum()
+    # log(det cov2 / det cov1), from the Cholesky diagonals
+    term_diag_L = 2 * np.sum(np.log(np.diagonal(L2) / np.diagonal(L1)))
+    div_kl = 0.5 * (norm_M - n_variables + norm_y + term_diag_L)
+    return div_kl
+
+
 def kl_divergence_gaussian(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame) -> float:
     """Kullback-Leibler divergence estimation based on a Gaussian approximation of both empirical
     distributions
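For reference, the new helper evaluates the standard closed-form expression for the divergence between two k-variate Gaussians (a textbook identity, stated here for readability rather than taken from the diff):

\[
D_{\mathrm{KL}}\bigl(\mathcal{N}(\mu_1, \Sigma_1) \,\|\, \mathcal{N}(\mu_2, \Sigma_2)\bigr)
= \frac{1}{2} \Bigl[ \operatorname{tr}(\Sigma_2^{-1} \Sigma_1)
+ (\mu_2 - \mu_1)^\top \Sigma_2^{-1} (\mu_2 - \mu_1)
- k + \ln \frac{\det \Sigma_2}{\det \Sigma_1} \Bigr]
\]

With the lower-triangular Cholesky factors Sigma_i = L_i L_i^T, norm_M is the trace term, norm_y the quadratic term, and term_diag_L the log-determinant term, so neither covariance matrix is ever inverted explicitly.
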
@@ -821,20 +859,12 @@ def kl_divergence_gaussian(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame) -> float:
821854 """
822855 df1 = df1 [df_mask .any (axis = 1 )]
823856 df2 = df2 [df_mask .any (axis = 1 )]
824- n_variables = len (df1 .columns )
825857 cov1 = df1 .cov ()
826858 cov2 = df2 .cov ()
827859 mean1 = df1 .mean ()
828860 mean2 = df2 .mean ()
829- L1 , lower1 = scipy .linalg .cho_factor (cov1 )
830- L2 , lower2 = scipy .linalg .cho_factor (cov2 )
831- M = scipy .linalg .solve (L2 , L1 )
832- y = scipy .linalg .solve (L2 , mean2 - mean1 )
833- norm_M = (M ** 2 ).sum ().sum ()
834- norm_y = (y ** 2 ).sum ()
835- term_diag_L = 2 * np .sum (np .log (np .diagonal (L2 ) / np .diagonal (L1 )))
836- print (norm_M , "-" , n_variables , "+" , norm_y , "+" , term_diag_L )
837- div_kl = 0.5 * (norm_M - n_variables + norm_y + term_diag_L )
861+
862+ div_kl = kl_divergence_gaussian_exact (mean1 , cov1 , mean2 , cov2 )
838863 return div_kl
839864
840865
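A minimal smoke test for the refactored wrapper, written as a sketch: it assumes numpy and pandas are installed and that kl_divergence_gaussian from this diff is in scope (the module import is omitted). Comparing a sample with itself should give a divergence near zero, and a shifted copy a positive one:

import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
df1 = pd.DataFrame(rng.normal(size=(500, 3)), columns=["a", "b", "c"])
df2 = pd.DataFrame(rng.normal(loc=0.5, size=(500, 3)), columns=["a", "b", "c"])
# All-True mask; the metric keeps rows where the mask has at least one True
df_mask = pd.DataFrame(True, index=df1.index, columns=df1.columns)

assert abs(kl_divergence_gaussian(df1, df1, df_mask)) < 1e-8
assert kl_divergence_gaussian(df1, df2, df_mask) > 0
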
@@ -1017,6 +1047,7 @@ def pattern_based_weighted_mean_metric(
         scores.append(metric(df1_pattern, df2_pattern, df_mask_pattern, **kwargs))
     if len(scores) == 0:
         raise NotEnoughSamples(max_num_row, min_n_rows)
+    # Aggregate per-pattern scores into a single weighted mean.
     return pd.Series(sum([s * w for s, w in zip(scores, weights)]), index=["All"])
 
 
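The return line is a plain convex combination of the per-pattern scores. A toy illustration with made-up numbers (the real weights, computed earlier in the function and not visible in this hunk, are presumably the shares of rows falling into each missingness pattern):

scores = [0.2, 0.5]     # metric value computed on each missingness pattern
weights = [0.75, 0.25]  # share of rows matching each pattern (sums to 1)
aggregate = sum(s * w for s, w in zip(scores, weights))  # 0.2*0.75 + 0.5*0.25 = 0.275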