@@ -125,71 +125,71 @@ def wasser_distance(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame)
     return columnwise_metric(df1, df2, df_mask, scipy.stats.wasserstein_distance)
 
 
-# def kl_divergence_1D(df1: pd.Series, df2: pd.Series) -> np.number:
-# min_val = min(df1.min(), df2.min())
-# max_val = max(df1.max(), df2.max())
-# bins = np.linspace(min_val, max_val, 20)
-# p = np.histogram(df1, bins=bins, density=True)[0]
-# q = np.histogram(df2, bins=bins, density=True)[0]
-# return scipy.stats.entropy(p + EPS, q + EPS)
-
-
-# def kl_divergence_columnwise(
-# df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame
-# ) -> pd.Series:
-# """TODO documentation
-# Kullback-Leibler divergence between distributions
-# If multivariate normal distributions:
-# https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
-
-# Parameters
-# ----------
-# df1 : pd.DataFrame
-# df2 : pd.DataFrame
-# columnwise_evaluation: Optional[bool]
-# if the evalutation is computed column-wise. By default, is set to False
-
-# Returns
-# -------
-# Kullback-Leibler divergence : Union[float, pd.Series]
-# """
-
-# return columnwise_metric(df1, df2, df_mask, kl_divergence_1D)
-
-
-# def kl_divergence(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame) -> pd.Series:
-# """TODO Documentation
-# Kullback-Leibler divergence between distributions
-# If multivariate normal distributions:
-# https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
-
-# Parameters
-# ----------
-# df1 : pd.DataFrame
-# df2 : pd.DataFrame
-# columnwise_evaluation: Optional[bool]
-# if the evalutation is computed column-wise. By default, is set to False
-
-# Returns
-# -------
-# Kullback-Leibler divergence : Union[float, pd.Series]
-# """
-# cols = df1.columns.tolist()
-# df_1 = StandardScaler().fit_transform(df1[df_mask.any(axis=1)])
-# df_2 = StandardScaler().fit_transform(df2[df_mask.any(axis=1)])
-
-# n = df_1.shape[0]
-# mu_true = np.nanmean(df_1, axis=0)
-# sigma_true = np.ma.cov(np.ma.masked_invalid(df_1), rowvar=False).data
-# mu_pred = np.nanmean(df_2, axis=0)
-# sigma_pred = np.ma.cov(np.ma.masked_invalid(df_2), rowvar=False).data
-# diff = mu_true - mu_pred
-# inv_sigma_pred = np.linalg.inv(sigma_pred)
-# quad_term = diff.T @ inv_sigma_pred @ diff
-# trace_term = np.trace(inv_sigma_pred @ sigma_true)
-# det_term = np.log(np.linalg.det(sigma_pred) / np.linalg.det(sigma_true))
-# kl = 0.5 * (quad_term + trace_term + det_term - n)
-# return pd.Series(kl, index=cols)
+def kl_divergence_1D(df1: pd.Series, df2: pd.Series) -> np.number:
+    # Estimate both densities on a shared 19-bin support over the pooled
+    # range, then smooth with EPS so the log ratio is always defined.
+    min_val = min(df1.min(), df2.min())
+    max_val = max(df1.max(), df2.max())
+    bins = np.linspace(min_val, max_val, 20)
+    p = np.histogram(df1, bins=bins, density=True)[0]
+    q = np.histogram(df2, bins=bins, density=True)[0]
+    return scipy.stats.entropy(p + EPS, q + EPS)
+
+
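As a quick illustration of the histogram estimator above — a minimal sketch, assuming EPS is the small module-level smoothing constant (the 1e-8 value below is only a stand-in):

    import numpy as np
    import pandas as pd
    import scipy.stats

    EPS = 1e-8  # stand-in for the module-level constant

    rng = np.random.default_rng(0)
    s1 = pd.Series(rng.normal(0.0, 1.0, 500))  # reference sample
    s2 = pd.Series(rng.normal(1.0, 1.0, 500))  # shifted sample

    # Same steps as kl_divergence_1D: shared 19-bin support over the
    # pooled range, density estimates, then smoothed KL via entropy.
    bins = np.linspace(min(s1.min(), s2.min()), max(s1.max(), s2.max()), 20)
    p = np.histogram(s1, bins=bins, density=True)[0]
    q = np.histogram(s2, bins=bins, density=True)[0]
    kl = scipy.stats.entropy(p + EPS, q + EPS)  # grows as the samples diverge

The estimate depends on the fixed 20-edge binning, so it is a coarse but fast proxy rather than an exact divergence.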
+def kl_divergence_columnwise(
+    df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame
+) -> pd.Series:
+    """Kullback-Leibler divergence between the empirical distributions
+    of two dataframes, computed column by column.
+    https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
+
+    Parameters
+    ----------
+    df1 : pd.DataFrame
+        True dataframe
+    df2 : pd.DataFrame
+        Predicted dataframe
+    df_mask : pd.DataFrame
+        Elementwise mask selecting the entries the metric is computed on
+
+    Returns
+    -------
+    pd.Series
+        Kullback-Leibler divergence, one value per column
+    """
+    return columnwise_metric(df1, df2, df_mask, kl_divergence_1D)
+
+
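For usage, a hedged sketch with made-up frames — columnwise_metric is assumed to apply the 1D metric to each column restricted to the masked entries:

    df_true = pd.DataFrame({"a": [1.0, 2.0, 3.0, 4.0], "b": [0.5, 0.7, 0.9, 1.1]})
    df_imputed = pd.DataFrame({"a": [1.1, 2.1, 2.9, 4.2], "b": [0.4, 0.8, 1.0, 1.0]})
    mask = pd.DataFrame(True, index=df_true.index, columns=df_true.columns)

    # One KL value per column, indexed like the input columns
    kl_per_column = kl_divergence_columnwise(df_true, df_imputed, mask)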
+def kl_divergence(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame) -> pd.Series:
+    """Kullback-Leibler divergence between two dataframes, using the
+    closed-form expression for multivariate normal distributions:
+    https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
+
+    Parameters
+    ----------
+    df1 : pd.DataFrame
+        True dataframe
+    df2 : pd.DataFrame
+        Predicted dataframe
+    df_mask : pd.DataFrame
+        Elementwise mask; only rows with at least one masked entry are used
+
+    Returns
+    -------
+    pd.Series
+        The Gaussian Kullback-Leibler divergence, repeated for each column
+    """
+    cols = df1.columns.tolist()
+    df_1 = StandardScaler().fit_transform(df1[df_mask.any(axis=1)])
+    df_2 = StandardScaler().fit_transform(df2[df_mask.any(axis=1)])
+
+    # The closed form subtracts the dimension of the distributions, i.e. the
+    # number of features, not the number of samples.
+    n_variables = df_1.shape[1]
+    mu_true = np.nanmean(df_1, axis=0)
+    sigma_true = np.ma.cov(np.ma.masked_invalid(df_1), rowvar=False).data
+    mu_pred = np.nanmean(df_2, axis=0)
+    sigma_pred = np.ma.cov(np.ma.masked_invalid(df_2), rowvar=False).data
+    diff = mu_true - mu_pred
+    inv_sigma_pred = np.linalg.inv(sigma_pred)
+    quad_term = diff.T @ inv_sigma_pred @ diff
+    trace_term = np.trace(inv_sigma_pred @ sigma_true)
+    det_term = np.log(np.linalg.det(sigma_pred) / np.linalg.det(sigma_true))
+    kl = 0.5 * (quad_term + trace_term + det_term - n_variables)
+    return pd.Series(kl, index=cols)
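The closed form implemented here is KL(N(mu_t, S_t) || N(mu_p, S_p)) = 0.5 * (tr(S_p^-1 S_t) + (mu_p - mu_t)^T S_p^-1 (mu_p - mu_t) - k + ln(det S_p / det S_t)), with k the dimension. A small numeric sanity check with hand-picked (assumed) parameters:

    # Two 2-dimensional Gaussians
    mu_t, mu_p = np.array([0.0, 0.0]), np.array([1.0, 0.0])
    sigma_t, sigma_p = np.eye(2), np.diag([2.0, 2.0])

    inv_p = np.linalg.inv(sigma_p)
    diff = mu_t - mu_p
    kl = 0.5 * (
        diff @ inv_p @ diff                  # quadratic term: 0.5
        + np.trace(inv_p @ sigma_t)          # trace term: 1.0
        - 2                                  # k = 2, the dimension
        + np.log(np.linalg.det(sigma_p) / np.linalg.det(sigma_t))  # log-det: ln 4
    )
    # kl = 0.5 * (0.5 + 1.0 - 2 + 1.386...) ≈ 0.443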
 
 
 def _get_numerical_features(df1: pd.DataFrame) -> List[str]: