
Commit bb556e8

feat: add kl_divergence unit tests
1 parent caae9a3 commit bb556e8

File tree

qolmat/benchmark/metrics.py
tests/benchmark/test_metrics.py

2 files changed: 97 additions, 65 deletions

qolmat/benchmark/metrics.py

Lines changed: 65 additions & 65 deletions
@@ -125,71 +125,71 @@ def wasser_distance(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame)
     return columnwise_metric(df1, df2, df_mask, scipy.stats.wasserstein_distance)
 
 
-# def kl_divergence_1D(df1: pd.Series, df2: pd.Series) -> np.number:
-#     min_val = min(df1.min(), df2.min())
-#     max_val = max(df1.max(), df2.max())
-#     bins = np.linspace(min_val, max_val, 20)
-#     p = np.histogram(df1, bins=bins, density=True)[0]
-#     q = np.histogram(df2, bins=bins, density=True)[0]
-#     return scipy.stats.entropy(p + EPS, q + EPS)
-
-
-# def kl_divergence_columnwise(
-#     df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame
-# ) -> pd.Series:
-#     """TODO documentation
-#     Kullback-Leibler divergence between distributions
-#     If multivariate normal distributions:
-#     https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
-
-#     Parameters
-#     ----------
-#     df1 : pd.DataFrame
-#     df2 : pd.DataFrame
-#     columnwise_evaluation: Optional[bool]
-#         if the evalutation is computed column-wise. By default, is set to False
-
-#     Returns
-#     -------
-#     Kullback-Leibler divergence : Union[float, pd.Series]
-#     """
-
-#     return columnwise_metric(df1, df2, df_mask, kl_divergence_1D)
-
-
-# def kl_divergence(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame) -> pd.Series:
-#     """TODO Documentation
-#     Kullback-Leibler divergence between distributions
-#     If multivariate normal distributions:
-#     https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
-
-#     Parameters
-#     ----------
-#     df1 : pd.DataFrame
-#     df2 : pd.DataFrame
-#     columnwise_evaluation: Optional[bool]
-#         if the evalutation is computed column-wise. By default, is set to False
-
-#     Returns
-#     -------
-#     Kullback-Leibler divergence : Union[float, pd.Series]
-#     """
-#     cols = df1.columns.tolist()
-#     df_1 = StandardScaler().fit_transform(df1[df_mask.any(axis=1)])
-#     df_2 = StandardScaler().fit_transform(df2[df_mask.any(axis=1)])
-
-#     n = df_1.shape[0]
-#     mu_true = np.nanmean(df_1, axis=0)
-#     sigma_true = np.ma.cov(np.ma.masked_invalid(df_1), rowvar=False).data
-#     mu_pred = np.nanmean(df_2, axis=0)
-#     sigma_pred = np.ma.cov(np.ma.masked_invalid(df_2), rowvar=False).data
-#     diff = mu_true - mu_pred
-#     inv_sigma_pred = np.linalg.inv(sigma_pred)
-#     quad_term = diff.T @ inv_sigma_pred @ diff
-#     trace_term = np.trace(inv_sigma_pred @ sigma_true)
-#     det_term = np.log(np.linalg.det(sigma_pred) / np.linalg.det(sigma_true))
-#     kl = 0.5 * (quad_term + trace_term + det_term - n)
-#     return pd.Series(kl, index=cols)
+def kl_divergence_1D(df1: pd.Series, df2: pd.Series) -> np.number:
+    min_val = min(df1.min(), df2.min())
+    max_val = max(df1.max(), df2.max())
+    bins = np.linspace(min_val, max_val, 20)
+    p = np.histogram(df1, bins=bins, density=True)[0]
+    q = np.histogram(df2, bins=bins, density=True)[0]
+    return scipy.stats.entropy(p + EPS, q + EPS)
+
+
+def kl_divergence_columnwise(
+    df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame
+) -> pd.Series:
+    """TODO documentation
+    Kullback-Leibler divergence between distributions
+    If multivariate normal distributions:
+    https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
+
+    Parameters
+    ----------
+    df1 : pd.DataFrame
+    df2 : pd.DataFrame
+    columnwise_evaluation: Optional[bool]
+        if the evalutation is computed column-wise. By default, is set to False
+
+    Returns
+    -------
+    Kullback-Leibler divergence : Union[float, pd.Series]
+    """
+
+    return columnwise_metric(df1, df2, df_mask, kl_divergence_1D)
+
+
+def kl_divergence(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame) -> pd.Series:
+    """TODO Documentation
+    Kullback-Leibler divergence between distributions
+    If multivariate normal distributions:
+    https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
+
+    Parameters
+    ----------
+    df1 : pd.DataFrame
+    df2 : pd.DataFrame
+    columnwise_evaluation: Optional[bool]
+        if the evalutation is computed column-wise. By default, is set to False
+
+    Returns
+    -------
+    Kullback-Leibler divergence : Union[float, pd.Series]
+    """
+    cols = df1.columns.tolist()
+    df_1 = StandardScaler().fit_transform(df1[df_mask.any(axis=1)])
+    df_2 = StandardScaler().fit_transform(df2[df_mask.any(axis=1)])
+
+    n = df_1.shape[0]
+    mu_true = np.nanmean(df_1, axis=0)
+    sigma_true = np.ma.cov(np.ma.masked_invalid(df_1), rowvar=False).data
+    mu_pred = np.nanmean(df_2, axis=0)
+    sigma_pred = np.ma.cov(np.ma.masked_invalid(df_2), rowvar=False).data
+    diff = mu_true - mu_pred
+    inv_sigma_pred = np.linalg.inv(sigma_pred)
+    quad_term = diff.T @ inv_sigma_pred @ diff
+    trace_term = np.trace(inv_sigma_pred @ sigma_true)
+    det_term = np.log(np.linalg.det(sigma_pred) / np.linalg.det(sigma_true))
+    kl = 0.5 * (quad_term + trace_term + det_term - n)
+    return pd.Series(kl, index=cols)
 
 
 def _get_numerical_features(df1: pd.DataFrame) -> List[str]:
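For reference, the closed-form Kullback-Leibler divergence between two multivariate normal distributions, the case the Wikipedia link in the docstrings points to, is (with k the dimension of the distributions):

D_{KL}\bigl(\mathcal{N}(\mu_1, \Sigma_1) \,\|\, \mathcal{N}(\mu_2, \Sigma_2)\bigr)
    = \frac{1}{2} \Bigl[ (\mu_1 - \mu_2)^{\top} \Sigma_2^{-1} (\mu_1 - \mu_2)
    + \operatorname{tr}\bigl(\Sigma_2^{-1} \Sigma_1\bigr)
    + \ln \frac{\det \Sigma_2}{\det \Sigma_1} - k \Bigr]

The quad_term, trace_term and det_term variables in the uncommented kl_divergence correspond to the three bracketed terms, with (mu_1, Sigma_1) estimated from the reference data df1 and (mu_2, Sigma_2) from the imputed data df2 after standardization.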

tests/benchmark/test_metrics.py

Lines changed: 32 additions & 0 deletions
@@ -92,6 +92,38 @@ def test_wasser_distance(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataF
     )
 
 
+@pytest.mark.parametrize("df1", [df_incomplete])
+@pytest.mark.parametrize("df2", [df_imputed])
+@pytest.mark.parametrize("df_mask", [df_mask])
+def test_kl_divergence_columnwise(
+    df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame
+) -> None:
+    assert metrics.kl_divergence_columnwise(df1, df1, df_mask).equals(
+        pd.Series([0.0, 0.0], index=["col1", "col2"])
+    )
+    assert (
+        metrics.kl_divergence_columnwise(df1, df2, df_mask)
+        .round(3)
+        .equals(pd.Series([18.945, 36.637], index=["col1", "col2"]))
+    )
+
+
+@pytest.mark.parametrize("df1", [df_incomplete])
+@pytest.mark.parametrize("df2", [df_imputed])
+@pytest.mark.parametrize("df_mask", [df_mask])
+def test_kl_divergence(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame) -> None:
+    assert (
+        metrics.kl_divergence(df1, df1, df_mask)
+        .round(2)
+        .equals(pd.Series([-0.5, -0.5], index=["col1", "col2"]))
+    )
+    assert (
+        metrics.kl_divergence(df1, df2, df_mask)
+        .round(3)
+        .equals(pd.Series([0.263, 0.263], index=["col1", "col2"]))
+    )
+
+
 @pytest.mark.parametrize("df1", [df_incomplete])
 @pytest.mark.parametrize("df2", [df_imputed])
 @pytest.mark.parametrize("df_mask", [df_mask])
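As a rough sketch of what the two new tests exercise, the snippet below calls both metrics on small hand-made frames. The toy data and the from qolmat.benchmark import metrics path are illustrative assumptions, not the df_incomplete / df_imputed / df_mask fixtures used in test_metrics.py:

import pandas as pd

from qolmat.benchmark import metrics  # assumed import path, mirroring the tests

# Hypothetical toy frames: df1 plays the role of the reference data, df2 the
# imputed data, and df_mask marks the entries to evaluate (here, all of them).
df1 = pd.DataFrame({"col1": [1.0, 2.0, 3.0, 4.0, 5.0], "col2": [2.0, 1.0, 4.0, 3.0, 5.0]})
df2 = pd.DataFrame({"col1": [1.2, 1.9, 3.1, 3.8, 5.2], "col2": [2.1, 1.2, 3.7, 3.3, 4.6]})
df_mask = pd.DataFrame(True, index=df1.index, columns=df1.columns)

# Histogram-based KL divergence, one value per column; with identical inputs it
# returns 0.0 per column, which is what test_kl_divergence_columnwise asserts first.
print(metrics.kl_divergence_columnwise(df1, df2, df_mask))

# Gaussian-approximation KL divergence. With identical inputs this implementation
# yields 0.5 * (n_columns - n_rows) for every column, i.e. 0.5 * (2 - 5) = -1.5
# for this toy frame.
print(metrics.kl_divergence(df1, df1, df_mask))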

0 commit comments
