1616def columnwise_metric (
1717 df1 : pd .DataFrame , df2 : pd .DataFrame , df_mask : pd .DataFrame , metric : Callable , ** kwargs
1818) -> pd .Series :
19+ """For each column, compute a metric score based on the true dataframe and the predicted dataframe
20+
21+ Parameters
22+ ----------
23+ df1 : pd.DataFrame
24+ True dataframe
25+ df2 : pd.DataFrame
26+ Predicted dataframe
27+ df_mask : pd.DataFrame
28+ Elements of the dataframes to compute on
29+ metric : Callable
30+ metric function
31+
32+ Returns
33+ -------
34+ pd.Series
35+ Series of scores for all columns
36+ """
1937 values = {}
2038 for col in df1 .columns :
2139 df1_col = df1 .loc [df_mask [col ], col ]
@@ -99,7 +117,7 @@ def weighted_mean_absolute_percentage_error(
99117
100118 Returns
101119 -------
102- Union[float, pd.Series]
120+ pd.Series
103121 """
104122 return columnwise_metric (df1 , df2 , df_mask , skm .mean_absolute_percentage_error )
105123
@@ -121,7 +139,8 @@ def wasserstein_distance(
121139
122140 Returns
123141 -------
124- wasserstein distances : pd.Series
142+ pd.Series
143+ wasserstein distances
125144 """
126145 if method == "columnwise" :
127146 return columnwise_metric (df1 , df2 , df_mask , scipy .stats .wasserstein_distance )
@@ -284,7 +303,7 @@ def kolmogorov_smirnov_test(
284303 )
285304
286305
287- def total_variance_distance_1D (df1 : pd .Series , df2 : pd .Series ) -> float :
306+ def _total_variance_distance_1D (df1 : pd .Series , df2 : pd .Series ) -> float :
288307 """Compute Total Variance Distance for a categorical feature.
289308 It is based on TVComplement in https://github.com/sdv-dev/SDMetrics
290309
@@ -297,8 +316,8 @@ def total_variance_distance_1D(df1: pd.Series, df2: pd.Series) -> float:
297316
298317 Returns
299318 -------
300- _type_
301- _description_
319+ float
320+ Total variance distance
302321 """
303322 list_categories = list (set (df1 .unique ()).union (set (df2 .unique ())))
304323 freqs1 = df1 .value_counts () / len (df1 )
@@ -334,7 +353,7 @@ def total_variance_distance(
334353 df1 [cols_categorical ],
335354 df2 [cols_categorical ],
336355 df_mask [cols_categorical ],
337- total_variance_distance_1D ,
356+ _total_variance_distance_1D ,
338357 )
339358
340359
@@ -566,6 +585,19 @@ def mean_difference_correlation_matrix_categorical_vs_numerical_features(
566585
567586
568587def _sum_manhattan_distances_1D (values : pd .Series ) -> float :
588+ """Sum of Manhattan distances computed for one column.
589+ It is based on https://www.geeksforgeeks.org/sum-manhattan-distances-pairs-points/
590+
591+ Parameters
592+ ----------
593+ values : pd.Series
594+ Values of a column
595+
596+ Returns
597+ -------
598+ float
599+ Sum of Manhattan distances
600+ """
569601 values = values .sort_values (ascending = True )
570602 sums_partial = values .shift ().fillna (0.0 ).cumsum ()
571603 differences_partial = values * np .arange (len (values )) - sums_partial
@@ -574,13 +606,17 @@ def _sum_manhattan_distances_1D(values: pd.Series) -> float:
574606
575607
576608def _sum_manhattan_distances (df1 : pd .DataFrame ) -> float :
577- """Sum Manhattan distances beetween all pairs of rows.
609+ """Sum Manhattan distances between all pairs of rows.
578610 It is based on https://www.geeksforgeeks.org/sum-manhattan-distances-pairs-points/
579611
580612 Parameters
581613 ----------
582- df : pd.DataFrame
583- _description_
614+ df1 : pd.DataFrame
615+
616+ Returns
617+ -------
618+ float
619+ Sum of Manhattan distances for all pairs of rows.
584620 """
585621 cols = df1 .columns .tolist ()
586622 result = sum ([_sum_manhattan_distances_1D (df1 [col ]) for col in cols ])
@@ -596,12 +632,14 @@ def sum_energy_distances(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataF
596632 df1 : pd.DataFrame
597633 true dataframe
598634 df2 : pd.DataFrame
599- _description_
635+ predicted dataframe
636+ df_mask : pd.DataFrame
637+ Elements of the dataframes to compute on
600638
601639 Returns
602640 -------
603- _type_
604- _description_
641+ pd.Series
642+ Sum of energy distances between df1 and df2.
605643 """
606644
607645 # Replace nan in dataframe
@@ -622,7 +660,9 @@ def sum_energy_distances(df1: pd.DataFrame, df2: pd.DataFrame, df_mask: pd.DataF
622660def sum_pairwise_distances (
623661 df1 : pd .DataFrame , df2 : pd .DataFrame , df_mask : pd .DataFrame , metric : str = "cityblock"
624662) -> pd .Series :
625- """Sum of pairwise distances based on a predefined metric
663+ """Sum of pairwise distances based on a predefined metric.
664+ Available metrics are listed in the scipy.spatial.distance.cdist documentation:
665+ https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.cdist.html
626666
627667 Parameters
628668 ----------
@@ -635,8 +675,8 @@ def sum_pairwise_distances(
635675
636676 Returns
637677 -------
638- _type_
639- _description_
678+ pd.Series
679+ Sum of pairwise distances based on a predefined metric
640680 """
641681 distances = np .sum (
642682 scipy .spatial .distance .cdist (
0 commit comments