Commit caae9a3
feat: add unit tests
1 parent 7403db6

3 files changed: +383 -299 lines changed

qolmat/benchmark/metrics.py

Lines changed: 79 additions & 80 deletions
@@ -125,71 +125,71 @@ def wasser_distance(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame)
     return columnwise_metric(df1, df2, df_mask, scipy.stats.wasserstein_distance)
 
 
-def kl_divergence_1D(df1: pd.Series, df2: pd.Series) -> np.number:
-    min_val = min(df1.min(), df2.min())
-    max_val = max(df1.max(), df2.max())
-    bins = np.linspace(min_val, max_val, 20)
-    p = np.histogram(df1, bins=bins, density=True)[0]
-    q = np.histogram(df2, bins=bins, density=True)[0]
-    return scipy.stats.entropy(p + EPS, q + EPS)
-
-
-def kl_divergence_columnwise(
-    df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame
-) -> pd.Series:
-    """TODO documentation
-    Kullback-Leibler divergence between distributions
-    If multivariate normal distributions:
-    https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
-
-    Parameters
-    ----------
-    df1 : pd.DataFrame
-    df2 : pd.DataFrame
-    columnwise_evaluation: Optional[bool]
-        if the evaluation is computed column-wise. By default, is set to False
-
-    Returns
-    -------
-    Kullback-Leibler divergence : Union[float, pd.Series]
-    """
-
-    return columnwise_metric(df1, df2, df_mask, kl_divergence_1D)
-
-
-def kl_divergence(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame) -> pd.Series:
-    """TODO Documentation
-    Kullback-Leibler divergence between distributions
-    If multivariate normal distributions:
-    https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
-
-    Parameters
-    ----------
-    df1 : pd.DataFrame
-    df2 : pd.DataFrame
-    columnwise_evaluation: Optional[bool]
-        if the evaluation is computed column-wise. By default, is set to False
-
-    Returns
-    -------
-    Kullback-Leibler divergence : Union[float, pd.Series]
-    """
-    cols = df1.columns.tolist()
-    df_1 = StandardScaler().fit_transform(df1[df_mask.any(axis=1)])
-    df_2 = StandardScaler().fit_transform(df2[df_mask.any(axis=1)])
-
-    n = df_1.shape[0]
-    mu_true = np.nanmean(df_1, axis=0)
-    sigma_true = np.ma.cov(np.ma.masked_invalid(df_1), rowvar=False).data
-    mu_pred = np.nanmean(df_2, axis=0)
-    sigma_pred = np.ma.cov(np.ma.masked_invalid(df_2), rowvar=False).data
-    diff = mu_true - mu_pred
-    inv_sigma_pred = np.linalg.inv(sigma_pred)
-    quad_term = diff.T @ inv_sigma_pred @ diff
-    trace_term = np.trace(inv_sigma_pred @ sigma_true)
-    det_term = np.log(np.linalg.det(sigma_pred) / np.linalg.det(sigma_true))
-    kl = 0.5 * (quad_term + trace_term + det_term - n)
-    return pd.Series(kl, index=cols)
+# def kl_divergence_1D(df1: pd.Series, df2: pd.Series) -> np.number:
+#     min_val = min(df1.min(), df2.min())
+#     max_val = max(df1.max(), df2.max())
+#     bins = np.linspace(min_val, max_val, 20)
+#     p = np.histogram(df1, bins=bins, density=True)[0]
+#     q = np.histogram(df2, bins=bins, density=True)[0]
+#     return scipy.stats.entropy(p + EPS, q + EPS)
+
+
+# def kl_divergence_columnwise(
+#     df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame
+# ) -> pd.Series:
+#     """TODO documentation
+#     Kullback-Leibler divergence between distributions
+#     If multivariate normal distributions:
+#     https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
+
+#     Parameters
+#     ----------
+#     df1 : pd.DataFrame
+#     df2 : pd.DataFrame
+#     columnwise_evaluation: Optional[bool]
+#         if the evaluation is computed column-wise. By default, is set to False
+
+#     Returns
+#     -------
+#     Kullback-Leibler divergence : Union[float, pd.Series]
+#     """
+
+#     return columnwise_metric(df1, df2, df_mask, kl_divergence_1D)
+
+
+# def kl_divergence(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame) -> pd.Series:
+#     """TODO Documentation
+#     Kullback-Leibler divergence between distributions
+#     If multivariate normal distributions:
+#     https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence
+
+#     Parameters
+#     ----------
+#     df1 : pd.DataFrame
+#     df2 : pd.DataFrame
+#     columnwise_evaluation: Optional[bool]
+#         if the evaluation is computed column-wise. By default, is set to False
+
+#     Returns
+#     -------
+#     Kullback-Leibler divergence : Union[float, pd.Series]
+#     """
+#     cols = df1.columns.tolist()
+#     df_1 = StandardScaler().fit_transform(df1[df_mask.any(axis=1)])
+#     df_2 = StandardScaler().fit_transform(df2[df_mask.any(axis=1)])
+
+#     n = df_1.shape[0]
+#     mu_true = np.nanmean(df_1, axis=0)
+#     sigma_true = np.ma.cov(np.ma.masked_invalid(df_1), rowvar=False).data
+#     mu_pred = np.nanmean(df_2, axis=0)
+#     sigma_pred = np.ma.cov(np.ma.masked_invalid(df_2), rowvar=False).data
+#     diff = mu_true - mu_pred
+#     inv_sigma_pred = np.linalg.inv(sigma_pred)
+#     quad_term = diff.T @ inv_sigma_pred @ diff
+#     trace_term = np.trace(inv_sigma_pred @ sigma_true)
+#     det_term = np.log(np.linalg.det(sigma_pred) / np.linalg.det(sigma_true))
+#     kl = 0.5 * (quad_term + trace_term + det_term - n)
+#     return pd.Series(kl, index=cols)
 
 
 def _get_numerical_features(df1: pd.DataFrame) -> List[str]:
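Note: the retired kl_divergence computed the closed-form KL divergence between two multivariate Gaussian fits, KL(N_1 || N_2) = 0.5 * [tr(Sigma_2^-1 Sigma_1) + (mu_2 - mu_1)^T Sigma_2^-1 (mu_2 - mu_1) - k + ln(det Sigma_2 / det Sigma_1)], where k is the number of dimensions. The commented-out body subtracts n = df_1.shape[0] (the row count) rather than the column count, which does not match the closed form. A minimal, self-contained sketch of the closed form; the toy data and column names are illustrative assumptions, not part of the commit:

```python
# Hedged sketch of the closed-form Gaussian KL that kl_divergence approximated.
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
df1 = pd.DataFrame(rng.normal(size=(200, 3)), columns=["a", "b", "c"])
df2 = pd.DataFrame(rng.normal(loc=0.5, size=(200, 3)), columns=["a", "b", "c"])

x1 = StandardScaler().fit_transform(df1)  # "true" sample
x2 = StandardScaler().fit_transform(df2)  # "predicted" sample

mu1, mu2 = x1.mean(axis=0), x2.mean(axis=0)
sigma1 = np.cov(x1, rowvar=False)  # covariance across columns
sigma2 = np.cov(x2, rowvar=False)

k = x1.shape[1]  # dimensionality: the closed form uses k, not the row count
diff = mu1 - mu2
inv_sigma2 = np.linalg.inv(sigma2)
kl = 0.5 * (
    diff @ inv_sigma2 @ diff
    + np.trace(inv_sigma2 @ sigma1)
    + np.log(np.linalg.det(sigma2) / np.linalg.det(sigma1))
    - k
)
print(float(kl))  # non-negative for well-conditioned covariances
```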
@@ -341,6 +341,11 @@ def total_variance_distance(
     )
 
 
+def _check_same_number_columns(df1: pd.DataFrame, df2: pd.DataFrame):
+    if len(df1.columns) != len(df2.columns):
+        raise Exception("inputs have to have the same number of columns.")
+
+
 def _get_correlation_pearson_matrix(df: pd.DataFrame, use_p_value: bool = True) -> pd.DataFrame:
     """Get matrix of correlation values for numerical features
     based on Pearson correlation coefficient or p-value for testing non-correlation.
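Note: the new helper centralises the column-count guard that the three hunks below remove from their call sites. A hypothetical usage sketch, with invented toy dataframes:

```python
# Illustrative only: the helper raises when the inputs disagree on column count.
import pandas as pd

def _check_same_number_columns(df1: pd.DataFrame, df2: pd.DataFrame):
    if len(df1.columns) != len(df2.columns):
        raise Exception("inputs have to have the same number of columns.")

df_a = pd.DataFrame({"x": [1, 2], "y": [3, 4]})
df_b = pd.DataFrame({"x": [1, 2]})

try:
    _check_same_number_columns(df_a, df_b)
except Exception as err:
    print(err)  # inputs have to have the same number of columns.
```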
@@ -400,8 +405,7 @@ def mean_difference_correlation_matrix_numerical_features(
     df1 = df1[df_mask].dropna(axis=0)
     df2 = df2[df_mask].dropna(axis=0)
 
-    if len(df1.columns) != len(df2.columns):
-        raise Exception("inputs have to have the same number of columns.")
+    _check_same_number_columns(df1, df2)
 
     cols_numerical = _get_numerical_features(df1)
     df_corr1 = _get_correlation_pearson_matrix(df1[cols_numerical], use_p_value=use_p_value)
@@ -470,8 +474,7 @@ def mean_difference_correlation_matrix_categorical_features(
     df1 = df1[df_mask].dropna(axis=0)
     df2 = df2[df_mask].dropna(axis=0)
 
-    if len(df1.columns) != len(df2.columns):
-        raise Exception("inputs have to have the same number of columns.")
+    _check_same_number_columns(df1, df2)
 
     cols_categorical = _get_categorical_features(df1)
     df_corr1 = _get_correlation_chi2_matrix(df1[cols_categorical], use_p_value=use_p_value)
@@ -510,14 +513,11 @@ def _get_correlation_f_oneway_matrix(
     for idx_cat, col_cat in enumerate(cols_categorical):
         for idx_num, col_num in enumerate(cols_numerical):
             category_group_lists = df.groupby(col_cat)[col_num].apply(list)
-            try:
-                res = scipy.stats.f_oneway(*category_group_lists)
-                if use_p_value:
-                    matrix[idx_cat, idx_num] = res[1]
-                else:
-                    matrix[idx_cat, idx_num] = res[0]
-            except ValueError:
-                matrix[idx_cat, idx_num] = 0.0
+            res = scipy.stats.f_oneway(*category_group_lists)
+            if use_p_value:
+                matrix[idx_cat, idx_num] = res[1]
+            else:
+                matrix[idx_cat, idx_num] = res[0]
     return pd.DataFrame(matrix, index=cols_categorical, columns=cols_numerical)
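Note: this hunk removes the try/except that silently mapped ANOVA failures to 0.0, so such errors now propagate to the caller. For reference, a toy run of the groupby + scipy.stats.f_oneway pattern used here (the data is invented); res[0] is the F statistic and res[1] the p-value:

```python
# One-way ANOVA of a numerical column across the levels of a categorical column.
import pandas as pd
import scipy.stats

df = pd.DataFrame({
    "cat": ["a", "a", "b", "b", "c", "c"],
    "num": [1.0, 1.5, 2.0, 2.5, 3.0, 3.5],
})
groups = df.groupby("cat")["num"].apply(list)  # one list of values per level
res = scipy.stats.f_oneway(*groups)
print(res[0], res[1])  # F statistic, p-value
```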
@@ -549,8 +549,7 @@ def mean_difference_correlation_matrix_categorical_vs_numerical_features(
     df1 = df1[df_mask].dropna(axis=0)
     df2 = df2[df_mask].dropna(axis=0)
 
-    if len(df1.columns) != len(df2.columns):
-        raise Exception("inputs have to have the same number of columns.")
+    _check_same_number_columns(df1, df2)
 
     cols_categorical = _get_categorical_features(df1)
     cols_numerical = _get_numerical_features(df1)
@@ -664,7 +663,7 @@ def frechet_distance(
     df2: pd.DataFrame,
     df_mask: pd.DataFrame,
     normalized: Optional[bool] = False,
-) -> float:
+) -> pd.Series:
     """Compute the Fréchet distance between two dataframes df1 and df2
     frechet_distance = || mu_1 - mu_2 ||_2^2 + Tr(Sigma_1 + Sigma_2 - 2(Sigma_1 . Sigma_2)^(1/2))
     if normalized, df1 and df2 are first scaled by a factor
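Note: only the return annotation changes here, from float to pd.Series. For reference, a hedged sketch of the formula quoted in the docstring, || mu_1 - mu_2 ||_2^2 + Tr(Sigma_1 + Sigma_2 - 2(Sigma_1 . Sigma_2)^(1/2)), computed on invented data; this is not the function itself:

```python
# Fréchet distance between Gaussian fits of two samples, per the docstring formula.
import numpy as np
import scipy.linalg

rng = np.random.default_rng(0)
x1 = rng.normal(size=(500, 3))
x2 = rng.normal(loc=0.3, size=(500, 3))

mu1, mu2 = x1.mean(axis=0), x2.mean(axis=0)
sigma1 = np.cov(x1, rowvar=False)
sigma2 = np.cov(x2, rowvar=False)

covmean = scipy.linalg.sqrtm(sigma1 @ sigma2)
if np.iscomplexobj(covmean):  # sqrtm can return negligible imaginary parts
    covmean = covmean.real
d2 = np.sum((mu1 - mu2) ** 2) + np.trace(sigma1 + sigma2 - 2.0 * covmean)
print(d2)
```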

qolmat/tests/test_benchmark/test_metrics.py

Lines changed: 0 additions & 125 deletions
This file was deleted.
