1- from typing import Callable , Dict , List , Optional , Union
1+ from typing import Callable , List , Optional
22
33import numpy as np
44import pandas as pd
1717def columnwise_metric (
1818 df1 : pd .DataFrame , df2 : pd .DataFrame , df_mask : pd .DataFrame , metric : Callable , ** kwargs
1919) -> pd .Series :
20+ """For each column, compute a metric score based on the true dataframe
21+ and the predicted dataframe.
22+
23+ Parameters
24+ ----------
25+ df1 : pd.DataFrame
26+ True dataframe
27+ df2 : pd.DataFrame
28+ Predicted dataframe
29+ df_mask : pd.DataFrame
30+ Elements of the dataframes to compute on
31+ metric : Callable
32+ metric function
33+
34+ Returns
35+ -------
36+ pd.Series
37+ Series of scores for all columns
38+ """
2039 values = {}
2140 for col in df1 .columns :
2241 df1_col = df1 .loc [df_mask [col ], col ]
@@ -102,7 +121,7 @@ def weighted_mean_absolute_percentage_error(
102121
103122 Returns
104123 -------
105- Union[float, pd.Series]
124+ pd.Series
106125 """
107126 return columnwise_metric (df1 , df2 , df_mask , skm .mean_absolute_percentage_error )
108127
@@ -124,7 +143,8 @@ def wasserstein_distance(
124143
125144 Returns
126145 -------
127- wasserstein distances : pd.Series
146+ pd.Series
147+ wasserstein distances
128148 """
129149 if method == "columnwise" :
130150 return columnwise_metric (df1 , df2 , df_mask , scipy .stats .wasserstein_distance )
@@ -319,7 +339,7 @@ def kolmogorov_smirnov_test(
319339 )
320340
321341
322- def total_variance_distance_1D (df1 : pd .Series , df2 : pd .Series ) -> float :
342+ def _total_variance_distance_1D (df1 : pd .Series , df2 : pd .Series ) -> float :
323343 """Compute Total Variance Distance for a categorical feature
324344 It is based on TVComplement in https://github.com/sdv-dev/SDMetrics
325345
@@ -332,8 +352,8 @@ def total_variance_distance_1D(df1: pd.Series, df2: pd.Series) -> float:
332352
333353 Returns
334354 -------
335- _type_
336- _description_
355+ float
356+ Total variance distance
337357 """
338358 list_categories = list (set (df1 .unique ()).union (set (df2 .unique ())))
339359 freqs1 = df1 .value_counts () / len (df1 )
@@ -368,7 +388,7 @@ def total_variance_distance(
368388 df1 [cols_categorical ],
369389 df2 [cols_categorical ],
370390 df_mask [cols_categorical ],
371- total_variance_distance_1D ,
391+ _total_variance_distance_1D ,
372392 )
373393
374394
@@ -600,6 +620,19 @@ def mean_difference_correlation_matrix_categorical_vs_numerical_features(
600620
601621
602622def _sum_manhattan_distances_1D (values : pd .Series ) -> float :
623+ """Sum of Manhattan distances computed for one column
624+ It is based on https://www.geeksforgeeks.org/sum-manhattan-distances-pairs-points/
625+
626+ Parameters
627+ ----------
628+ values : pd.Series
629+ Values of a column
630+
631+ Returns
632+ -------
633+ float
634+ Sum of Manhattan distances
635+ """
603636 values = values .sort_values (ascending = True )
604637 sums_partial = values .shift ().fillna (0.0 ).cumsum ()
605638 differences_partial = values * np .arange (len (values )) - sums_partial
@@ -608,13 +641,17 @@ def _sum_manhattan_distances_1D(values: pd.Series) -> float:
608641
609642
610643def _sum_manhattan_distances (df1 : pd .DataFrame ) -> float :
611- """Sum Manhattan distances beetween all pairs of rows.
644+ """Sum Manhattan distances between all pairs of rows.
612645 It is based on https://www.geeksforgeeks.org/sum-manhattan-distances-pairs-points/
613646
614647 Parameters
615648 ----------
616- df : pd.DataFrame
617- _description_
649+ df1 : pd.DataFrame
650+
651+ Returns
652+ -------
653+ float
654+ Sum of Manhattan distances for all pairs of rows.
618655 """
619656 cols = df1 .columns .tolist ()
620657 result = sum ([_sum_manhattan_distances_1D (df1 [col ]) for col in cols ])
@@ -630,12 +667,14 @@ def sum_energy_distances(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataF
630667 df1 : pd.DataFrame
631668 true dataframe
632669 df2 : pd.DataFrame
633- _description_
670+ predicted dataframe
671+ df_mask : pd.DataFrame
672+ Elements of the dataframes to compute on
634673
635674 Returns
636675 -------
637- _type_
638- _description_
676+ pd.Series
677+ Sum of energy distances between df1 and df2.
639678 """
640679
641680 # Replace nan in dataframe
@@ -656,7 +695,9 @@ def sum_energy_distances(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataF
656695def sum_pairwise_distances (
657696 df1 : pd .DataFrame , df2 : pd .DataFrame , df_mask : pd .DataFrame , metric : str = "cityblock"
658697) -> pd .Series :
659- """Sum of pairwise distances based on a predefined metric
698+ """Sum of pairwise distances based on a predefined metric.
699+ The available metrics are listed in the scipy ``cdist`` documentation:
700+ https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cdist.html
660701
661702 Parameters
662703 ----------
@@ -669,8 +710,8 @@ def sum_pairwise_distances(
669710
670711 Returns
671712 -------
672- _type_
673- _description_
713+ pd.Series
714+ Sum of pairwise distances based on a predefined metric
674715 """
675716 distances = np .sum (
676717 scipy .spatial .distance .cdist (
0 commit comments