Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 27 additions & 10 deletions mealy/error_analysis_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,15 @@


def get_epsilon(difference):
"""
Compute the threshold used to decide whether a prediction is wrong or correct (for regression tasks).
"""Compute the threshold used to decide whether a prediction is wrong or correct (for regression tasks).

Args:
difference (1D-array): The absolute differences between the true target values and the predicted ones (by the primary model).
difference (numpy.ndarray): The absolute differences between the true target values and the predicted ones
(by the primary model).

Return:
epsilon (float): The value of the threshold used to decide whether the prediction for a regression task is wrong or correct
Returns:
float: The value of the threshold used to decide whether the prediction for a regression task
is wrong or correct.
"""
epsilon_range = np.linspace(min(difference), max(difference), num=ErrorAnalyzerConstants.NUMBER_EPSILON_VALUES)
cdf_error = []
Expand All @@ -23,7 +24,18 @@ def get_epsilon(difference):
cdf_error.append(np.count_nonzero(correct_predictions) / float(n_samples))
return KneeLocator(epsilon_range, cdf_error).knee


def get_feature_list_from_column_transformer(ct_preprocessor):
"""Get list of feature names and categorical feature names from a ColumnTransformer preprocessor.

Args:
ct_preprocessor (sklearn.compose.ColumnTransformer): ColumnTransformer containing separate feature
preprocessing steps.

Returns:
all_features (list): list of feature names.
categorical_features (list): list of categorical feature names.
"""
all_features, categorical_features = [], []
for transformer_name, transformer, transformer_feature_names in ct_preprocessor.transformers_:
if transformer_name == 'remainder' and transformer == 'drop':
Expand All @@ -42,16 +54,21 @@ def get_feature_list_from_column_transformer(ct_preprocessor):


def check_lists_having_same_elements(list_A, list_B):
    """Return True when both inputs contain exactly the same set of unique elements.

    Args:
        list_A (list): First collection of elements.
        list_B (list): Second collection of elements.

    Returns:
        bool: True if the deduplicated contents of both lists are equal.
    """
    unique_elements_a = set(list_A)
    unique_elements_b = set(list_B)
    return unique_elements_a == unique_elements_b


def check_enough_data(df, min_len):
"""
Compare length of dataframe to minimum lenght of the test data.
"""Compare length of dataframe to minimum length of the test data.

Used in the relevance of the measure.

:param df: Input dataframe
:param min_len:
:return:
Args:
df (pandas.DataFrame): Input dataframe
min_len (int): Minimum number of rows required.

Raises:
ValueError: If df does not have the required minimum number of rows.
"""
if df.shape[0] < min_len:
raise ValueError(
Expand Down
127 changes: 64 additions & 63 deletions mealy/error_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,21 +22,28 @@


class ErrorAnalyzer(BaseEstimator):
""" ErrorAnalyzer analyzes the errors of a prediction model on a test set.
"""ErrorAnalyzer analyzes the errors of a prediction model on a test set.

It uses model predictions and ground truth target to compute the model errors on the test set.
It then trains a Decision Tree, called a Error Analyzer Tree, on the same test set by using the model error
as target. The nodes of the decision tree are different segments of errors to be studied individually.

Args:
primary_model (sklearn.base.BaseEstimator or sklearn.pipeline.Pipeline): a sklearn model to analyze. Either an estimator
primary_model (sklearn.base.BaseEstimator or sklearn.pipeline.Pipeline): A sklearn model to analyze. Either an estimator
or a Pipeline containing a ColumnTransformer with the preprocessing steps and an estimator as last step.
feature_names (list of str): list of feature names. Defaults to None.
param_grid (dict): sklearn.tree.DecisionTree hyper-parameters values for grid search.
random_state (int): random seed.
feature_names (list): List of feature names, default=None.
param_grid (dict): The sklearn.tree.DecisionTree hyper-parameters values for grid search.
random_state (int): Random seed.

Attributes:
_error_tree (DecisionTreeClassifier): the estimator used to train the Error Analyzer Tree
_primary_model (sklearn.base.BaseEstimator or sklearn.pipeline.Pipeline): A sklearn model to analyze. Either an estimator
or a Pipeline containing a ColumnTransformer with the preprocessing steps and an estimator as last step.
feature_names (list): List of feature names.
param_grid (dict): The sklearn.tree.DecisionTreeClassifier hyper-parameters values for grid search.
random_state (int): Random seed.
preprocessed_feature_names (list): List of preprocessed feature names.
error_tree (ErrorTree): the Error Tree.
epsilon (float): The threshold used to assess if a prediction is wrong, only used for regression primary models.
"""

def __init__(self, primary_model,
Expand Down Expand Up @@ -106,16 +113,15 @@ def preprocessed_feature_names(self):
return self.pipeline_preprocessor.get_preprocessed_feature_names()

def fit(self, X, y):
"""
Fit the Error Analyzer Tree.
"""Fit the Error Analyzer Tree.

Trains the Error Analyzer Tree, a Decision Tree to discriminate between samples that are correctly
predicted or wrongly predicted (errors) by a primary model.

Args:
X (numpy.ndarray or pandas.DataFrame): feature data from a test set to evaluate the primary predictor and
X (numpy.ndarray or pandas.DataFrame): Feature data from a test set to evaluate the primary predictor and
train a Error Analyzer Tree.
y (numpy.ndarray or pandas.DataFrame): target data from a test set to evaluate the primary predictor and
y (numpy.ndarray or pandas.DataFrame): Target data from a test set to evaluate the primary predictor and
train a Error Analyzer Tree.
"""
logger.info("Preparing the Error Analyzer Tree...")
Expand Down Expand Up @@ -159,15 +165,15 @@ def get_error_leaf_summary(self, leaf_selector=None, add_path_to_leaves=False,
* array-like: Only return information of the leaves corresponding to these ids
* None (default): Return information of all the leaves
add_path_to_leaves (bool): Whether to add information of the path across the tree till the selected node. Defaults to False.
output_format (string): Return format used for the report. Valid values are 'dict' or 'str'. Defaults to 'dict'.
output_format (str): Return format used for the report. Valid values are 'dict' or 'str'. Defaults to 'dict'.
rank_by (str): Ranking criterion for the leaves. Valid values are:
* 'total_error_fraction' (default): rank by the fraction of total error in the node
* 'purity': rank by the purity (ratio of wrongly predicted samples over the total number of node samples)
* 'class_difference': rank by the difference of number of wrongly and correctly predicted samples
in a node.

Return:
dict or str: list of reports (as dictionary or string) with different information on each selected leaf.
Returns:
dict or str: List of reports (as dictionary or string) with different information on each selected leaf.
"""

leaf_nodes = self._get_ranked_leaf_ids(leaf_selector=leaf_selector, rank_by=rank_by)
Expand Down Expand Up @@ -214,37 +220,31 @@ def evaluate(self, X, y, output_format='str'):
Return ErrorAnalyzer summary metrics regarding the Error Tree.

Args:
X (numpy.ndarray or pandas.DataFrame): feature data from a test set to evaluate the primary predictor
X (numpy.ndarray or pandas.DataFrame): Feature data from a test set to evaluate the primary predictor
and train a Error Analyzer Tree.
y (numpy.ndarray or pandas.DataFrame): target data from a test set to evaluate the primary predictor and
y (numpy.ndarray or pandas.DataFrame): Target data from a test set to evaluate the primary predictor and
train a Error Analyzer Tree.
output_format (string): Return format used for the report. Valid values are 'dict' or 'str'. Defaults to 'str'.
output_format (str): Return format used for the report. Valid values are 'dict' or 'str'. Defaults to 'str'.

Return:
dict or str: dictionary or string report storing different metrics regarding the Error Decision Tree.
Returns:
dict or str: Dictionary or string report storing different metrics regarding the Error Tree.
"""
prep_x, prep_y = self.pipeline_preprocessor.transform(X), np.array(y)
y_pred = self.error_tree.estimator_.predict(prep_x)
y_true, _ = self._compute_primary_model_error(prep_x, prep_y)
return error_decision_tree_report(y_true, y_pred, output_format)

def _compute_primary_model_error(self, X, y):
"""
Computes the errors of the primary model predictions and samples
"""Computes the errors of the primary model predictions and samples.

Args:
X: array-like of shape (n_samples, n_features)
Input samples.

y: array-like of shape (n_samples,)
True target values for `X`.
X (numpy.ndarray): Input samples of shape `(n_samples, n_features)`.
y (numpy.ndarray): True target values for `X` of shape `(n_samples,)`.

Returns:
sampled_X: ndarray
A sample of `X`.

error_y: array of string of shape (n_sampled_X, )
Boolean value of whether or not the primary model predicted correctly or incorrectly the samples in sampled_X.
error_y (numpy.ndarray): Array of booleans of shape `(len(y),)`, containing a boolean value of whether or
not the primary model got the prediction right.
error_rate (float): Accuracy of the primary model.
"""
if is_regressor(self._primary_model) or len(np.unique(y)) > 2:
# regression or multiclass classification models: no proba threshold
Expand All @@ -258,22 +258,16 @@ def _compute_primary_model_error(self, X, y):
return error_y, error_rate

def _evaluate_primary_model_predictions(self, y_true, y_pred):
"""
Compute errors of the primary model on the test set
"""Compute errors of the primary model on the test set.

Args:
y_true: 1D array
True target values.

y_pred: 1D array
Predictions of the primary model.
y_true (numpy.ndarray): True target values.
y_pred (numpy.ndarray): Predictions of the primary model.

Return:
error_y: array of string of len(y_true)
Boolean value of whether or not the primary model got the prediction right.

error_rate: float
Accuracy of the primary model
Returns:
error_y (numpy.ndarray): Array of booleans of shape `(len(y),)`, containing a boolean value of whether or
not the primary model got the prediction right.
error_rate (float): Accuracy of the primary model.
"""

error_y = np.full_like(y_true, ErrorAnalyzerConstants.CORRECT_PREDICTION, dtype="O")
Expand All @@ -300,18 +294,18 @@ def _get_ranked_leaf_ids(self, leaf_selector=None, rank_by='total_error_fraction
""" Select error nodes and rank them by importance.

Args:
leaf_selector (None, int or array-like): the leaves whose information will be returned
leaf_selector (None, int or array-like): The leaves whose information will be returned
* int: Only return information of the leaf with the corresponding id
* array-like: Only return information of the leaves corresponding to these ids
* None (default): Return information of all the leaves
rank_by (str): ranking criterion for the leaves. Valid values are:
rank_by (str): Ranking criterion for the leaves. Valid values are:
* 'total_error_fraction': rank by the fraction of total error in the node
* 'purity': rank by the purity (ratio of wrongly predicted samples over the total number of node samples)
* 'class_difference': rank by the difference of number of wrongly and correctly predicted samples
in a node.

Return:
list or numpy.ndarray: list of selected leaves indices.
Returns:
list or numpy.ndarray: List of selected leaves indices.

"""
apply_leaf_selector = self._get_leaf_selector(leaf_selector)
Expand All @@ -331,20 +325,20 @@ def _get_ranked_leaf_ids(self, leaf_selector=None, rank_by='total_error_fraction

#TODO leaf_selector is taking too many different types of data ?
def _get_leaf_selector(self, leaf_selector):
"""
Return a function that select rows of provided arrays. Arrays must be of shape (1, number of leaves)
Args:
leaf_selector: None, int or array-like
How to select the rows of the array
* int: Only keep the row corresponding to this leaf id
* array-like: Only keep the rows corresponding to these leaf ids
* None (default): Keep the whole array of leaf ids

Return:
A function with one argument array as a selector of leaf ids
"""Return a function that select rows of provided arrays.

Arrays must be of shape (1, number of leaves).

Args:
leaf_selector (None, int or array-like): How to select the rows of the array
* int: Only keep the row corresponding to this leaf id
* array-like: Only keep the rows corresponding to these leaf ids
* None (default): Keep the whole array of leaf ids

Returns:
A function with one argument `array` as a selector of leaf ids.
Args:
array: numpy array of shape (1, number of leaves)
An array of which we only want to keep some rows
array (numpy.array): array of shape (1, number of leaves) of which we only want to keep some rows.
"""
if leaf_selector is None:
return lambda array: array
Expand All @@ -359,7 +353,14 @@ def _get_leaf_selector(self, leaf_selector):
return lambda array: array[leaf_selector]

def _get_path_to_node(self, node_id):
""" Return path to node as a list of split steps from the nodes of the sklearn Tree object """
""" Return path to node as a list of split steps from the nodes of the sklearn Tree object.

Args:
node_id (int): Node identifier.

Returns:
path_to_node (str): Text describing the path from the root to the input node.
"""
feature_names = self.pipeline_preprocessor.get_original_feature_names()
children_left = list(self.error_tree.estimator_.tree_.children_left)
children_right = list(self.error_tree.estimator_.tree_.children_right)
Expand Down Expand Up @@ -404,7 +405,7 @@ def _inverse_transform_features(self):
indicate what features are used to split the training set at each node.
See https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html.

Return:
Returns:
list or numpy.ndarray:
indices of features of the Error Analyzer Tree, possibly mapped back to the
original unprocessed feature space.
Expand All @@ -421,7 +422,7 @@ def _inverse_transform_thresholds(self):
the decision tree. The thresholds of a decision tree are the feature values used to split the training set at
each node. See https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html.

Return:
Returns:
numpy.ndarray:
thresholds of the Error Tree, possibly with preprocessing undone.
"""
Expand Down
24 changes: 24 additions & 0 deletions mealy/error_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,32 @@
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='mealy | %(levelname)s - %(message)s')


class ErrorTree(object):
""" ErrorTree analyzes the errors of a prediction model on a test set.

It uses model predictions and ground truth target to compute the model errors on the test set.
It then trains a Decision Tree, called a Error Analyzer Tree, on the same test set by using the model error
as target. The nodes of the decision tree are different segments of errors to be studied individually.

Args:
error_decision_tree (sklearn.tree.DecisionTreeClassifier): The estimator used to train the Error Tree.

Attributes:
estimator_ (sklearn.tree.DecisionTreeClassifier): The estimator used to train the Error Tree.
impurity (numpy.ndarray): Impurity of leaves.
quantized_impurity (numpy.ndarray): Impurity of leaves quantized into ErrorAnalyzerConstants.NUMBER_PURITY_LEVELS
levels.
difference (numpy.ndarray): Difference of number of wrongly and correctly predicted samples in leaves.
total_error_fraction (numpy.ndarray): Percentage of incorrectly predicted samples in leaves over the total
number of errors (used to rank the nodes).
error_class_idx (int): Index of class of wrongly predicted samples in the Error Tree.
n_total_errors (int): Number of total errors.
wrongly_predicted_leaves (numpy.ndarray): Array of number of wrongly predicted samples in leaves.
correctly_predicted_leaves (numpy.ndarray): Array of number of correctly predicted samples in leaves.
leaf_ids (numpy.ndarray): List of all leaves indices.

"""
def __init__(self, error_decision_tree):

self._estimator = error_decision_tree
Expand Down
5 changes: 2 additions & 3 deletions mealy/error_visualizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,7 @@ def _plot_feature_distribution(x_ticks, feature_is_numerical, leaf_data, root_da


class ErrorVisualizer(_BaseErrorVisualizer):
"""
ErrorVisualizer provides visual utilities to analyze the Error Tree in ErrorAnalyzer
"""ErrorVisualizer provides visual utilities to analyze the Error Tree in ErrorAnalyzer.

Args:
error_analyzer (ErrorAnalyzer): fitted ErrorAnalyzer representing the performance of a primary model.
Expand All @@ -99,7 +98,7 @@ def plot_error_tree(self, size=None):
Args:
size (tuple): size of the output plot.

Return:
Returns:
graphviz.Source: graph of the Error Analyzer Tree.

"""
Expand Down
Loading