Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 27 additions & 10 deletions mealy/error_analysis_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,15 @@


def get_epsilon(difference):
"""
Compute the threshold used to decide whether a prediction is wrong or correct (for regression tasks).
"""Compute the threshold used to decide whether a prediction is wrong or correct (for regression tasks).

Args:
difference (1D-array): The absolute differences between the true target values and the predicted ones (by the primary model).
difference (numpy.ndarray): The absolute differences between the true target values and the predicted ones
(by the primary model).

Return:
epsilon (float): The value of the threshold used to decide whether the prediction for a regression task is wrong or correct
Returns:
float: The value of the threshold used to decide whether the prediction for a regression task
is wrong or correct.
"""
epsilon_range = np.linspace(min(difference), max(difference), num=ErrorAnalyzerConstants.NUMBER_EPSILON_VALUES)
cdf_error = []
Expand All @@ -23,7 +24,18 @@ def get_epsilon(difference):
cdf_error.append(np.count_nonzero(correct_predictions) / float(n_samples))
return KneeLocator(epsilon_range, cdf_error).knee


def get_feature_list_from_column_transformer(ct_preprocessor):
"""Get list of feature names and categorical feature names from a ColumnTransformer preprocessor.

Args:
ct_preprocessor (sklearn.compose.ColumnTransformer): ColumnTransformer containing separate feature
preprocessing steps.

Returns:
all_features (list): list of feature names.
categorical_features (list): list of categorical feature names.
"""
all_features, categorical_features = [], []
for transformer_name, transformer, transformer_feature_names in ct_preprocessor.transformers_:
if transformer_name == 'remainder' and transformer == 'drop':
Expand All @@ -42,16 +54,21 @@ def get_feature_list_from_column_transformer(ct_preprocessor):


def check_lists_having_same_elements(list_A, list_B):
    """Return True when both inputs contain exactly the same set of unique elements.

    Args:
        list_A (list): First collection of elements.
        list_B (list): Second collection of elements.

    Returns:
        bool: True if the deduplicated contents of both lists are equal.
    """
    unique_elements_a = set(list_A)
    unique_elements_b = set(list_B)
    return unique_elements_a == unique_elements_b


def check_enough_data(df, min_len):
"""
Compare length of dataframe to minimum lenght of the test data.
"""Compare length of dataframe to minimum length of the test data.

Used in the relevance of the measure.

:param df: Input dataframe
:param min_len:
:return:
Args:
df (pandas.DataFrame): Input dataframe
min_len (int): Minimum number of rows required.

Raises:
ValueError: If df does not have the required minimum number of rows.
"""
if df.shape[0] < min_len:
raise ValueError(
Expand Down
127 changes: 64 additions & 63 deletions mealy/error_analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,21 +22,28 @@


class ErrorAnalyzer(BaseEstimator):
""" ErrorAnalyzer analyzes the errors of a prediction model on a test set.
"""ErrorAnalyzer analyzes the errors of a prediction model on a test set.

It uses model predictions and ground truth target to compute the model errors on the test set.
It then trains a Decision Tree, called a Error Analyzer Tree, on the same test set by using the model error
as target. The nodes of the decision tree are different segments of errors to be studied individually.

Args:
primary_model (sklearn.base.BaseEstimator or sklearn.pipeline.Pipeline): a sklearn model to analyze. Either an estimator
primary_model (sklearn.base.BaseEstimator or sklearn.pipeline.Pipeline): A sklearn model to analyze. Either an estimator
or a Pipeline containing a ColumnTransformer with the preprocessing steps and an estimator as last step.
feature_names (list of str): list of feature names. Defaults to None.
param_grid (dict): sklearn.tree.DecisionTree hyper-parameters values for grid search.
random_state (int): random seed.
feature_names (list): List of feature names, default=None.
param_grid (dict): The sklearn.tree.DecisionTree hyper-parameters values for grid search.
random_state (int): Random seed.

Attributes:
_error_tree (DecisionTreeClassifier): the estimator used to train the Error Analyzer Tree
_primary_model (sklearn.base.BaseEstimator or sklearn.pipeline.Pipeline): A sklearn model to analyze. Either an estimator
or a Pipeline containing a ColumnTransformer with the preprocessing steps and an estimator as last step.
feature_names (list): List of feature names.
param_grid (dict): The sklearn.tree.DecisionTreeClassifier hyper-parameters values for grid search.
random_state (int): Random seed.
preprocessed_feature_names (list): List of preprocessed feature names.
error_tree (ErrorTree): the Error Tree.
epsilon (float): The threshold used to assess if a prediction is wrong, only used for regression primary models.
"""

def __init__(self, primary_model,
Expand Down Expand Up @@ -106,16 +113,15 @@ def preprocessed_feature_names(self):
return self.pipeline_preprocessor.get_preprocessed_feature_names()

def fit(self, X, y):
"""
Fit the Error Analyzer Tree.
"""Fit the Error Analyzer Tree.

Trains the Error Analyzer Tree, a Decision Tree to discriminate between samples that are correctly
predicted or wrongly predicted (errors) by a primary model.

Args:
X (numpy.ndarray or pandas.DataFrame): feature data from a test set to evaluate the primary predictor and
X (numpy.ndarray or pandas.DataFrame): Feature data from a test set to evaluate the primary predictor and
train a Error Analyzer Tree.
y (numpy.ndarray or pandas.DataFrame): target data from a test set to evaluate the primary predictor and
y (numpy.ndarray or pandas.DataFrame): Target data from a test set to evaluate the primary predictor and
train a Error Analyzer Tree.
"""
logger.info("Preparing the Error Analyzer Tree...")
Expand Down Expand Up @@ -159,15 +165,15 @@ def get_error_leaf_summary(self, leaf_selector=None, add_path_to_leaves=False,
* array-like: Only return information of the leaves corresponding to these ids
* None (default): Return information of all the leaves
add_path_to_leaves (bool): Whether to add information of the path across the tree till the selected node. Defaults to False.
output_format (string): Return format used for the report. Valid values are 'dict' or 'str'. Defaults to 'dict'.
output_format (str): Return format used for the report. Valid values are 'dict' or 'str'. Defaults to 'dict'.
rank_by (str): Ranking criterion for the leaves. Valid values are:
* 'total_error_fraction' (default): rank by the fraction of total error in the node
* 'purity': rank by the purity (ratio of wrongly predicted samples over the total number of node samples)
* 'class_difference': rank by the difference of number of wrongly and correctly predicted samples
in a node.

Return:
dict or str: list of reports (as dictionary or string) with different information on each selected leaf.
Returns:
dict or str: List of reports (as dictionary or string) with different information on each selected leaf.
"""

leaf_nodes = self._get_ranked_leaf_ids(leaf_selector=leaf_selector, rank_by=rank_by)
Expand Down Expand Up @@ -214,37 +220,31 @@ def evaluate(self, X, y, output_format='str'):
Return ErrorAnalyzer summary metrics regarding the Error Tree.

Args:
X (numpy.ndarray or pandas.DataFrame): feature data from a test set to evaluate the primary predictor
X (numpy.ndarray or pandas.DataFrame): Feature data from a test set to evaluate the primary predictor
and train a Error Analyzer Tree.
y (numpy.ndarray or pandas.DataFrame): target data from a test set to evaluate the primary predictor and
y (numpy.ndarray or pandas.DataFrame): Target data from a test set to evaluate the primary predictor and
train a Error Analyzer Tree.
output_format (string): Return format used for the report. Valid values are 'dict' or 'str'. Defaults to 'str'.
output_format (str): Return format used for the report. Valid values are 'dict' or 'str'. Defaults to 'str'.

Return:
dict or str: dictionary or string report storing different metrics regarding the Error Decision Tree.
Returns:
dict or str: Dictionary or string report storing different metrics regarding the Error Tree.
"""
prep_x, prep_y = self.pipeline_preprocessor.transform(X), np.array(y)
y_pred = self.error_tree.estimator_.predict(prep_x)
y_true, _ = self._compute_primary_model_error(prep_x, prep_y)
return error_decision_tree_report(y_true, y_pred, output_format)

def _compute_primary_model_error(self, X, y):
"""
Computes the errors of the primary model predictions and samples
"""Computes the errors of the primary model predictions and samples.

Args:
X: array-like of shape (n_samples, n_features)
Input samples.

y: array-like of shape (n_samples,)
True target values for `X`.
X (numpy.ndarray): Input samples of shape `(n_samples, n_features)`.
y (numpy.ndarray): True target values for `X` of shape `(n_samples,)`.

Returns:
sampled_X: ndarray
A sample of `X`.

error_y: array of string of shape (n_sampled_X, )
Boolean value of whether or not the primary model predicted correctly or incorrectly the samples in sampled_X.
error_y (numpy.ndarray): Array of booleans of shape `(len(y),)`, containing a boolean value of whether or
not the primary model got the prediction right.
error_rate (float): Accuracy of the primary model.
"""
if is_regressor(self._primary_model) or len(np.unique(y)) > 2:
# regression or multiclass classification models: no proba threshold
Expand All @@ -258,22 +258,16 @@ def _compute_primary_model_error(self, X, y):
return error_y, error_rate

def _evaluate_primary_model_predictions(self, y_true, y_pred):
"""
Compute errors of the primary model on the test set
"""Compute errors of the primary model on the test set.

Args:
y_true: 1D array
True target values.

y_pred: 1D array
Predictions of the primary model.
y_true (numpy.ndarray): True target values.
y_pred (numpy.ndarray): Predictions of the primary model.

Return:
error_y: array of string of len(y_true)
Boolean value of whether or not the primary model got the prediction right.

error_rate: float
Accuracy of the primary model
Returns:
error_y (numpy.ndarray): Array of booleans of shape `(len(y),)`, containing a boolean value of whether or
not the primary model got the prediction right.
error_rate (float): Accuracy of the primary model.
"""

error_y = np.full_like(y_true, ErrorAnalyzerConstants.CORRECT_PREDICTION, dtype="O")
Expand All @@ -300,18 +294,18 @@ def _get_ranked_leaf_ids(self, leaf_selector=None, rank_by='total_error_fraction
""" Select error nodes and rank them by importance.

Args:
leaf_selector (None, int or array-like): the leaves whose information will be returned
leaf_selector (None, int or array-like): The leaves whose information will be returned
* int: Only return information of the leaf with the corresponding id
* array-like: Only return information of the leaves corresponding to these ids
* None (default): Return information of all the leaves
rank_by (str): ranking criterion for the leaves. Valid values are:
rank_by (str): Ranking criterion for the leaves. Valid values are:
* 'total_error_fraction': rank by the fraction of total error in the node
* 'purity': rank by the purity (ratio of wrongly predicted samples over the total number of node samples)
* 'class_difference': rank by the difference of number of wrongly and correctly predicted samples
in a node.

Return:
list or numpy.ndarray: list of selected leaves indices.
Returns:
list or numpy.ndarray: List of selected leaves indices.

"""
apply_leaf_selector = self._get_leaf_selector(leaf_selector)
Expand All @@ -331,20 +325,20 @@ def _get_ranked_leaf_ids(self, leaf_selector=None, rank_by='total_error_fraction

#TODO leaf_selector is taking too many different types of data ?
def _get_leaf_selector(self, leaf_selector):
"""
Return a function that select rows of provided arrays. Arrays must be of shape (1, number of leaves)
Args:
leaf_selector: None, int or array-like
How to select the rows of the array
* int: Only keep the row corresponding to this leaf id
* array-like: Only keep the rows corresponding to these leaf ids
* None (default): Keep the whole array of leaf ids

Return:
A function with one argument array as a selector of leaf ids
"""Return a function that select rows of provided arrays.

Arrays must be of shape (1, number of leaves).

Args:
leaf_selector (None, int or array-like): How to select the rows of the array
* int: Only keep the row corresponding to this leaf id
* array-like: Only keep the rows corresponding to these leaf ids
* None (default): Keep the whole array of leaf ids

Returns:
A function with one argument `array` as a selector of leaf ids.
Args:
array: numpy array of shape (1, number of leaves)
An array of which we only want to keep some rows
array (numpy.array): array of shape (1, number of leaves) of which we only want to keep some rows.
"""
if leaf_selector is None:
return lambda array: array
Expand All @@ -359,7 +353,14 @@ def _get_leaf_selector(self, leaf_selector):
return lambda array: array[leaf_selector]

def _get_path_to_node(self, node_id):
""" Return path to node as a list of split steps from the nodes of the sklearn Tree object """
""" Return path to node as a list of split steps from the nodes of the sklearn Tree object.

Args:
node_id (int): Node identifier.

Returns:
path_to_node (str): Text describing the path from the root to the input node.
"""
feature_names = self.pipeline_preprocessor.get_original_feature_names()
children_left = list(self.error_tree.estimator_.tree_.children_left)
children_right = list(self.error_tree.estimator_.tree_.children_right)
Expand Down Expand Up @@ -404,7 +405,7 @@ def _inverse_transform_features(self):
indicate what features are used to split the training set at each node.
See https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html.

Return:
Returns:
list or numpy.ndarray:
indices of features of the Error Analyzer Tree, possibly mapped back to the
original unprocessed feature space.
Expand All @@ -421,7 +422,7 @@ def _inverse_transform_thresholds(self):
the decision tree. The thresholds of a decision tree are the feature values used to split the training set at
each node. See https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html.

Return:
Returns:
numpy.ndarray:
thresholds of the Error Tree, possibly with preprocessing undone.
"""
Expand Down
24 changes: 24 additions & 0 deletions mealy/error_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,32 @@
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO, format='mealy | %(levelname)s - %(message)s')


class ErrorTree(object):
""" ErrorTree analyzes the errors of a prediction model on a test set.

It uses model predictions and ground truth target to compute the model errors on the test set.
It then trains a Decision Tree, called a Error Analyzer Tree, on the same test set by using the model error
as target. The nodes of the decision tree are different segments of errors to be studied individually.

Args:
error_decision_tree (sklearn.tree.DecisionTreeClassifier): The estimator used to train the Error Tree.

Attributes:
estimator_ (sklearn.tree.DecisionTreeClassifier): The estimator used to train the Error Tree.
impurity (numpy.ndarray): Impurity of leaves.
quantized_impurity (numpy.ndarray): Impurity of leaves quantized into ErrorAnalyzerConstants.NUMBER_PURITY_LEVELS
levels.
difference (numpy.ndarray): Difference of number of wrongly and correctly predicted samples in leaves.
total_error_fraction (numpy.ndarray): Percentage of incorrectly predicted samples in leaves over the total
number of errors (used to rank the nodes).
error_class_idx (int): Index of class of wrongly predicted samples in the Error Tree.
n_total_errors (int): Number of total errors.
wrongly_predicted_leaves (numpy.ndarray): Array of number of wrongly predicted samples in leaves.
correctly_predicted_leaves (numpy.ndarray): Array of number of correctly predicted samples in leaves.
leaf_ids (numpy.ndarray): List of all leaves indices.

"""
def __init__(self, error_decision_tree):

self._estimator = error_decision_tree
Expand Down
5 changes: 2 additions & 3 deletions mealy/error_visualizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,7 @@ def _plot_feature_distribution(x_ticks, feature_is_numerical, leaf_data, root_da


class ErrorVisualizer(_BaseErrorVisualizer):
"""
ErrorVisualizer provides visual utilities to analyze the Error Tree in ErrorAnalyzer
"""ErrorVisualizer provides visual utilities to analyze the Error Tree in ErrorAnalyzer.

Args:
error_analyzer (ErrorAnalyzer): fitted ErrorAnalyzer representing the performance of a primary model.
Expand All @@ -99,7 +98,7 @@ def plot_error_tree(self, size=None):
Args:
size (tuple): size of the output plot.

Return:
Returns:
graphviz.Source: graph of the Error Analyzer Tree.

"""
Expand Down
Loading