Skip to content

Commit cb43460

Browse files
committed
feat: add docstring
1 parent 1771a34 commit cb43460

File tree

1 file changed

+55
-15
lines changed

1 file changed

+55
-15
lines changed

qolmat/benchmark/metrics.py

Lines changed: 55 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,24 @@
1616
def columnwise_metric(
1717
df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame, metric: Callable, **kwargs
1818
) -> pd.Series:
19+
"""For each column, compute a metric score based on the true dataframe and the predicted dataframe
20+
21+
Parameters
22+
----------
23+
df1 : pd.DataFrame
24+
True dataframe
25+
df2 : pd.DataFrame
26+
Predicted dataframe
27+
df_mask : pd.DataFrame
28+
Elements of the dataframes to compute on
29+
metric : Callable
30+
Metric function used to compute the score of each column
31+
32+
Returns
33+
-------
34+
pd.Series
35+
Series of scores for all columns
36+
"""
1937
values = {}
2038
for col in df1.columns:
2139
df1_col = df1.loc[df_mask[col], col]
@@ -99,7 +117,7 @@ def weighted_mean_absolute_percentage_error(
99117
100118
Returns
101119
-------
102-
Union[float, pd.Series]
120+
pd.Series
103121
"""
104122
return columnwise_metric(df1, df2, df_mask, skm.mean_absolute_percentage_error)
105123

@@ -121,7 +139,8 @@ def wasserstein_distance(
121139
122140
Returns
123141
-------
124-
wasserstein distances : pd.Series
142+
pd.Series
143+
wasserstein distances
125144
"""
126145
if method == "columnwise":
127146
return columnwise_metric(df1, df2, df_mask, scipy.stats.wasserstein_distance)
@@ -284,7 +303,7 @@ def kolmogorov_smirnov_test(
284303
)
285304

286305

287-
def total_variance_distance_1D(df1: pd.Series, df2: pd.Series) -> float:
306+
def _total_variance_distance_1D(df1: pd.Series, df2: pd.Series) -> float:
288307
"""Compute Total Variation Distance for a categorical feature
289308
It is based on TVComplement in https://github.com/sdv-dev/SDMetrics
290309
@@ -297,8 +316,8 @@ def total_variance_distance_1D(df1: pd.Series, df2: pd.Series) -> float:
297316
298317
Returns
299318
-------
300-
_type_
301-
_description_
319+
float
320+
Total variation distance
302321
"""
303322
list_categories = list(set(df1.unique()).union(set(df2.unique())))
304323
freqs1 = df1.value_counts() / len(df1)
@@ -334,7 +353,7 @@ def total_variance_distance(
334353
df1[cols_categorical],
335354
df2[cols_categorical],
336355
df_mask[cols_categorical],
337-
total_variance_distance_1D,
356+
_total_variance_distance_1D,
338357
)
339358

340359

@@ -566,6 +585,19 @@ def mean_difference_correlation_matrix_categorical_vs_numerical_features(
566585

567586

568587
def _sum_manhattan_distances_1D(values: pd.Series) -> float:
588+
"""Sum of Manhattan distances computed for one column
589+
It is based on https://www.geeksforgeeks.org/sum-manhattan-distances-pairs-points/
590+
591+
Parameters
592+
----------
593+
values : pd.Series
594+
Values of a column
595+
596+
Returns
597+
-------
598+
float
599+
Sum of Manhattan distances
600+
"""
569601
values = values.sort_values(ascending=True)
570602
sums_partial = values.shift().fillna(0.0).cumsum()
571603
differences_partial = values * np.arange(len(values)) - sums_partial
@@ -574,13 +606,17 @@ def _sum_manhattan_distances_1D(values: pd.Series) -> float:
574606

575607

576608
def _sum_manhattan_distances(df1: pd.DataFrame) -> float:
577-
"""Sum Manhattan distances beetween all pairs of rows.
609+
"""Sum Manhattan distances between all pairs of rows.
578610
It is based on https://www.geeksforgeeks.org/sum-manhattan-distances-pairs-points/
579611
580612
Parameters
581613
----------
582-
df : pd.DataFrame
583-
_description_
614+
df1 : pd.DataFrame
Dataframe whose rows are the points to compare pairwise
615+
616+
Returns
617+
-------
618+
float
619+
Sum of Manhattan distances for all pairs of rows.
584620
"""
585621
cols = df1.columns.tolist()
586622
result = sum([_sum_manhattan_distances_1D(df1[col]) for col in cols])
@@ -596,12 +632,14 @@ def sum_energy_distances(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataF
596632
df1 : pd.DataFrame
597633
true dataframe
598634
df2 : pd.DataFrame
599-
_description_
635+
predicted dataframe
636+
df_mask : pd.DataFrame
637+
Elements of the dataframes to compute on
600638
601639
Returns
602640
-------
603-
_type_
604-
_description_
641+
pd.Series
642+
Sum of energy distances between df1 and df2.
605643
"""
606644

607645
# Replace nan in dataframe
@@ -622,7 +660,9 @@ def sum_energy_distances(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataF
622660
def sum_pairwise_distances(
623661
df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataFrame, metric: str = "cityblock"
624662
) -> pd.Series:
625-
"""Sum of pairwise distances based on a predefined metric
663+
"""Sum of pairwise distances based on a predefined metric.
664+
The available metrics are listed at
665+
https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cdist.html
626666
627667
Parameters
628668
----------
@@ -635,8 +675,8 @@ def sum_pairwise_distances(
635675
636676
Returns
637677
-------
638-
_type_
639-
_description_
678+
pd.Series
679+
Sum of pairwise distances based on a predefined metric
640680
"""
641681
distances = np.sum(
642682
scipy.spatial.distance.cdist(

0 commit comments

Comments
 (0)