Skip to content

Commit 268f39f

Browse files
Merge pull request #28 from Quantmetry/angoho_benchmarks_dev
[Draft] Angoho benchmarks dev
2 parents 564ead2 + 5b371cb commit 268f39f

File tree

1 file changed

+57
-16
lines changed

1 file changed

+57
-16
lines changed

qolmat/benchmark/metrics.py

Lines changed: 57 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from typing import Callable, Dict, List, Optional, Union
1+
from typing import Callable, List, Optional
22

33
import numpy as np
44
import pandas as pd
@@ -17,6 +17,25 @@
1717
def columnwise_metric(
1818
df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame, metric: Callable, **kwargs
1919
) -> pd.Series:
20+
"""For each column, compute a metric score based on the true dataframe
21+
and the predicted dataframe
22+
23+
Parameters
24+
----------
25+
df1 : pd.DataFrame
26+
True dataframe
27+
df2 : pd.DataFrame
28+
Predicted dataframe
29+
df_mask : pd.DataFrame
30+
Elements of the dataframes to compute on
31+
metric : Callable
32+
metric function
33+
34+
Returns
35+
-------
36+
pd.Series
37+
Series of scores for all columns
38+
"""
2039
values = {}
2140
for col in df1.columns:
2241
df1_col = df1.loc[df_mask[col], col]
@@ -102,7 +121,7 @@ def weighted_mean_absolute_percentage_error(
102121
103122
Returns
104123
-------
105-
Union[float, pd.Series]
124+
pd.Series
106125
"""
107126
return columnwise_metric(df1, df2, df_mask, skm.mean_absolute_percentage_error)
108127

@@ -124,7 +143,8 @@ def wasserstein_distance(
124143
125144
Returns
126145
-------
127-
wasserstein distances : pd.Series
146+
pd.Series
147+
wasserstein distances
128148
"""
129149
if method == "columnwise":
130150
return columnwise_metric(df1, df2, df_mask, scipy.stats.wasserstein_distance)
@@ -319,7 +339,7 @@ def kolmogorov_smirnov_test(
319339
)
320340

321341

322-
def total_variance_distance_1D(df1: pd.Series, df2: pd.Series) -> float:
342+
def _total_variance_distance_1D(df1: pd.Series, df2: pd.Series) -> float:
323343
"""Compute the Total Variation Distance (TVD) for a categorical feature
324344
It is based on TVComplement in https://github.com/sdv-dev/SDMetrics
325345
@@ -332,8 +352,8 @@ def total_variance_distance_1D(df1: pd.Series, df2: pd.Series) -> float:
332352
333353
Returns
334354
-------
335-
_type_
336-
_description_
355+
float
356+
Total variation distance
337357
"""
338358
list_categories = list(set(df1.unique()).union(set(df2.unique())))
339359
freqs1 = df1.value_counts() / len(df1)
@@ -368,7 +388,7 @@ def total_variance_distance(
368388
df1[cols_categorical],
369389
df2[cols_categorical],
370390
df_mask[cols_categorical],
371-
total_variance_distance_1D,
391+
_total_variance_distance_1D,
372392
)
373393

374394

@@ -600,6 +620,19 @@ def mean_difference_correlation_matrix_categorical_vs_numerical_features(
600620

601621

602622
def _sum_manhattan_distances_1D(values: pd.Series) -> float:
623+
"""Sum of Manhattan distances computed for one column
624+
It is based on https://www.geeksforgeeks.org/sum-manhattan-distances-pairs-points/
625+
626+
Parameters
627+
----------
628+
values : pd.Series
629+
Values of a column
630+
631+
Returns
632+
-------
633+
float
634+
Sum of Manhattan distances
635+
"""
603636
values = values.sort_values(ascending=True)
604637
sums_partial = values.shift().fillna(0.0).cumsum()
605638
differences_partial = values * np.arange(len(values)) - sums_partial
@@ -608,13 +641,17 @@ def _sum_manhattan_distances_1D(values: pd.Series) -> float:
608641

609642

610643
def _sum_manhattan_distances(df1: pd.DataFrame) -> float:
611-
"""Sum Manhattan distances beetween all pairs of rows.
644+
"""Sum Manhattan distances between all pairs of rows.
612645
It is based on https://www.geeksforgeeks.org/sum-manhattan-distances-pairs-points/
613646
614647
Parameters
615648
----------
616-
df : pd.DataFrame
617-
_description_
649+
df1 : pd.DataFrame
650+
651+
Returns
652+
-------
653+
float
654+
Sum of Manhattan distances for all pairs of rows.
618655
"""
619656
cols = df1.columns.tolist()
620657
result = sum([_sum_manhattan_distances_1D(df1[col]) for col in cols])
@@ -630,12 +667,14 @@ def sum_energy_distances(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataF
630667
df1 : pd.DataFrame
631668
true dataframe
632669
df2 : pd.DataFrame
633-
_description_
670+
predicted dataframe
671+
df_mask : pd.DataFrame
672+
Elements of the dataframes to compute on
634673
635674
Returns
636675
-------
637-
_type_
638-
_description_
676+
pd.Series
677+
Sum of energy distances between df1 and df2.
639678
"""
640679

641680
# Replace nan in dataframe
@@ -656,7 +695,9 @@ def sum_energy_distances(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataF
656695
def sum_pairwise_distances(
657696
df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame, metric: str = "cityblock"
658697
) -> pd.Series:
659-
"""Sum of pairwise distances based on a predefined metric
698+
"""Sum of pairwise distances based on a predefined metric.
699+
The available metrics are listed at:
700+
https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cdist.html
660701
661702
Parameters
662703
----------
@@ -669,8 +710,8 @@ def sum_pairwise_distances(
669710
670711
Returns
671712
-------
672-
_type_
673-
_description_
713+
pd.Series
714+
Sum of pairwise distances based on a predefined metric
674715
"""
675716
distances = np.sum(
676717
scipy.spatial.distance.cdist(

0 commit comments

Comments
 (0)