diff --git a/mealy/error_analysis_utils.py b/mealy/error_analysis_utils.py index 4166248..24307b0 100644 --- a/mealy/error_analysis_utils.py +++ b/mealy/error_analysis_utils.py @@ -6,14 +6,15 @@ def get_epsilon(difference): - """ - Compute the threshold used to decide whether a prediction is wrong or correct (for regression tasks). + """Compute the threshold used to decide whether a prediction is wrong or correct (for regression tasks). Args: - difference (1D-array): The absolute differences between the true target values and the predicted ones (by the primary model). + difference (numpy.ndarray): The absolute differences between the true target values and the predicted ones + (by the primary model). - Return: - epsilon (float): The value of the threshold used to decide whether the prediction for a regression task is wrong or correct + Returns: + float: The value of the threshold used to decide whether the prediction for a regression task + is wrong or correct. """ epsilon_range = np.linspace(min(difference), max(difference), num=ErrorAnalyzerConstants.NUMBER_EPSILON_VALUES) cdf_error = [] @@ -23,7 +24,18 @@ def get_epsilon(difference): cdf_error.append(np.count_nonzero(correct_predictions) / float(n_samples)) return KneeLocator(epsilon_range, cdf_error).knee + def get_feature_list_from_column_transformer(ct_preprocessor): + """Get list of feature names and categorical feature names from a ColumnTransformer preprocessor. + + Args: + ct_preprocessor (sklearn.compose.ColumnTransformer): ColumnTransformer containing separate feature + preprocessing steps. + + Returns: + all_features (list): list of feature names. + categorical_features (list): list of categorical feature names. 
+ """ all_features, categorical_features = [], [] for transformer_name, transformer, transformer_feature_names in ct_preprocessor.transformers_: if transformer_name == 'remainder' and transformer == 'drop': @@ -42,16 +54,21 @@ def get_feature_list_from_column_transformer(ct_preprocessor): def check_lists_having_same_elements(list_A, list_B): + """Check two lists have the same unique elements.""" return set(list_A) == set(list_B) + def check_enough_data(df, min_len): - """ - Compare length of dataframe to minimum lenght of the test data. + """Compare length of dataframe to minimum length of the test data. + Used in the relevance of the measure. - :param df: Input dataframe - :param min_len: - :return: + Args: + df (pandas.DataFrame): Input dataframe + min_len (int): Minimum number of rows required. + + Raises: + ValueError: If df does not have the required minimum number of rows. """ if df.shape[0] < min_len: raise ValueError( diff --git a/mealy/error_analyzer.py b/mealy/error_analyzer.py index 9eb024a..cde0bb4 100644 --- a/mealy/error_analyzer.py +++ b/mealy/error_analyzer.py @@ -22,21 +22,28 @@ class ErrorAnalyzer(BaseEstimator): - """ ErrorAnalyzer analyzes the errors of a prediction model on a test set. + """ErrorAnalyzer analyzes the errors of a prediction model on a test set. It uses model predictions and ground truth target to compute the model errors on the test set. It then trains a Decision Tree, called a Error Analyzer Tree, on the same test set by using the model error as target. The nodes of the decision tree are different segments of errors to be studied individually. Args: - primary_model (sklearn.base.BaseEstimator or sklearn.pipeline.Pipeline): a sklearn model to analyze. Either an estimator + primary_model (sklearn.base.BaseEstimator or sklearn.pipeline.Pipeline): A sklearn model to analyze. Either an estimator or a Pipeline containing a ColumnTransformer with the preprocessing steps and an estimator as last step. 
- feature_names (list of str): list of feature names. Defaults to None. - param_grid (dict): sklearn.tree.DecisionTree hyper-parameters values for grid search. - random_state (int): random seed. + feature_names (list): List of feature names, default=None. + param_grid (dict): The sklearn.tree.DecisionTree hyper-parameters values for grid search. + random_state (int): Random seed. Attributes: - _error_tree (DecisionTreeClassifier): the estimator used to train the Error Analyzer Tree + _primary_model (sklearn.base.BaseEstimator or sklearn.pipeline.Pipeline): A sklearn model to analyze. Either an estimator + or a Pipeline containing a ColumnTransformer with the preprocessing steps and an estimator as last step. + feature_names (list): List of feature names. + param_grid (dict): The sklearn.tree.DecisionTreeClassifier hyper-parameters values for grid search. + random_state (int): Random seed. + preprocessed_feature_names (list): List of preprocessed feature names. + error_tree (ErrorTree): the Error Tree. + epsilon (float): The threshold used to assess if a prediction is wrong, only used for regression primary models. """ def __init__(self, primary_model, @@ -106,16 +113,15 @@ def preprocessed_feature_names(self): return self.pipeline_preprocessor.get_preprocessed_feature_names() def fit(self, X, y): - """ - Fit the Error Analyzer Tree. + """Fit the Error Analyzer Tree. Trains the Error Analyzer Tree, a Decision Tree to discriminate between samples that are correctly predicted or wrongly predicted (errors) by a primary model. Args: - X (numpy.ndarray or pandas.DataFrame): feature data from a test set to evaluate the primary predictor and + X (numpy.ndarray or pandas.DataFrame): Feature data from a test set to evaluate the primary predictor and train a Error Analyzer Tree. 
- y (numpy.ndarray or pandas.DataFrame): target data from a test set to evaluate the primary predictor and + y (numpy.ndarray or pandas.DataFrame): Target data from a test set to evaluate the primary predictor and train a Error Analyzer Tree. """ logger.info("Preparing the Error Analyzer Tree...") @@ -159,15 +165,15 @@ def get_error_leaf_summary(self, leaf_selector=None, add_path_to_leaves=False, * array-like: Only return information of the leaves corresponding to these ids * None (default): Return information of all the leaves add_path_to_leaves (bool): Whether to add information of the path across the tree till the selected node. Defaults to False. - output_format (string): Return format used for the report. Valid values are 'dict' or 'str'. Defaults to 'dict'. + output_format (str): Return format used for the report. Valid values are 'dict' or 'str'. Defaults to 'dict'. rank_by (str): Ranking criterion for the leaves. Valid values are: * 'total_error_fraction' (default): rank by the fraction of total error in the node * 'purity': rank by the purity (ratio of wrongly predicted samples over the total number of node samples) * 'class_difference': rank by the difference of number of wrongly and correctly predicted samples in a node. - Return: - dict or str: list of reports (as dictionary or string) with different information on each selected leaf. + Returns: + dict or str: List of reports (as dictionary or string) with different information on each selected leaf. """ leaf_nodes = self._get_ranked_leaf_ids(leaf_selector=leaf_selector, rank_by=rank_by) @@ -214,14 +220,14 @@ def evaluate(self, X, y, output_format='str'): Return ErrorAnalyzer summary metrics regarding the Error Tree. Args: - X (numpy.ndarray or pandas.DataFrame): feature data from a test set to evaluate the primary predictor + X (numpy.ndarray or pandas.DataFrame): Feature data from a test set to evaluate the primary predictor and train a Error Analyzer Tree. 
- y (numpy.ndarray or pandas.DataFrame): target data from a test set to evaluate the primary predictor and + y (numpy.ndarray or pandas.DataFrame): Target data from a test set to evaluate the primary predictor and train a Error Analyzer Tree. - output_format (string): Return format used for the report. Valid values are 'dict' or 'str'. Defaults to 'str'. + output_format (str): Return format used for the report. Valid values are 'dict' or 'str'. Defaults to 'str'. - Return: - dict or str: dictionary or string report storing different metrics regarding the Error Decision Tree. + Returns: + dict or str: Dictionary or string report storing different metrics regarding the Error Tree. """ prep_x, prep_y = self.pipeline_preprocessor.transform(X), np.array(y) y_pred = self.error_tree.estimator_.predict(prep_x) @@ -229,22 +235,16 @@ def evaluate(self, X, y, output_format='str'): return error_decision_tree_report(y_true, y_pred, output_format) def _compute_primary_model_error(self, X, y): - """ - Computes the errors of the primary model predictions and samples + """Computes the errors of the primary model predictions and samples. Args: - X: array-like of shape (n_samples, n_features) - Input samples. - - y: array-like of shape (n_samples,) - True target values for `X`. + X (numpy.ndarray): Input samples of shape `(n_samples, n_features)`. + y (numpy.ndarray): True target values for `X` of shape `(n_samples,)`. Returns: - sampled_X: ndarray - A sample of `X`. - - error_y: array of string of shape (n_sampled_X, ) - Boolean value of whether or not the primary model predicted correctly or incorrectly the samples in sampled_X. + error_y (numpy.ndarray): Array of booleans of shape `(len(y),)`, containing a boolean value of whether or + not the primary model got the prediction right. + error_rate (float): Accuracy of the primary model. 
""" if is_regressor(self._primary_model) or len(np.unique(y)) > 2: # regression or multiclass classification models: no proba threshold @@ -258,22 +258,16 @@ def _compute_primary_model_error(self, X, y): return error_y, error_rate def _evaluate_primary_model_predictions(self, y_true, y_pred): - """ - Compute errors of the primary model on the test set + """Compute errors of the primary model on the test set. Args: - y_true: 1D array - True target values. - - y_pred: 1D array - Predictions of the primary model. + y_true (numpy.ndarray): True target values. + y_pred (numpy.ndarray): Predictions of the primary model. - Return: - error_y: array of string of len(y_true) - Boolean value of whether or not the primary model got the prediction right. - - error_rate: float - Accuracy of the primary model + Returns: + error_y (numpy.ndarray): Array of booleans of shape `(len(y),)`, containing a boolean value of whether or + not the primary model got the prediction right. + error_rate (float): Accuracy of the primary model. """ error_y = np.full_like(y_true, ErrorAnalyzerConstants.CORRECT_PREDICTION, dtype="O") @@ -300,18 +294,18 @@ def _get_ranked_leaf_ids(self, leaf_selector=None, rank_by='total_error_fraction """ Select error nodes and rank them by importance. Args: - leaf_selector (None, int or array-like): the leaves whose information will be returned + leaf_selector (None, int or array-like): The leaves whose information will be returned * int: Only return information of the leaf with the corresponding id * array-like: Only return information of the leaves corresponding to these ids * None (default): Return information of all the leaves - rank_by (str): ranking criterion for the leaves. Valid values are: + rank_by (str): Ranking criterion for the leaves. 
Valid values are: * 'total_error_fraction': rank by the fraction of total error in the node * 'purity': rank by the purity (ratio of wrongly predicted samples over the total number of node samples) * 'class_difference': rank by the difference of number of wrongly and correctly predicted samples in a node. - Return: - list or numpy.ndarray: list of selected leaves indices. + Returns: + list or numpy.ndarray: List of selected leaves indices. """ apply_leaf_selector = self._get_leaf_selector(leaf_selector) @@ -331,20 +325,20 @@ def _get_ranked_leaf_ids(self, leaf_selector=None, rank_by='total_error_fraction #TODO leaf_selector is taking too many different types of data ? def _get_leaf_selector(self, leaf_selector): - """ - Return a function that select rows of provided arrays. Arrays must be of shape (1, number of leaves) - Args: - leaf_selector: None, int or array-like - How to select the rows of the array - * int: Only keep the row corresponding to this leaf id - * array-like: Only keep the rows corresponding to these leaf ids - * None (default): Keep the whole array of leaf ids - - Return: - A function with one argument array as a selector of leaf ids + """Return a function that select rows of provided arrays. + + Arrays must be of shape (1, number of leaves). + + Args: + leaf_selector (None, int or array-like): How to select the rows of the array + * int: Only keep the row corresponding to this leaf id + * array-like: Only keep the rows corresponding to these leaf ids + * None (default): Keep the whole array of leaf ids + + Returns: + A function with one argument `array` as a selector of leaf ids. Args: - array: numpy array of shape (1, number of leaves) - An array of which we only want to keep some rows + array (numpy.array): array of shape (1, number of leaves) of which we only want to keep some rows. 
""" if leaf_selector is None: return lambda array: array @@ -359,7 +353,14 @@ def _get_leaf_selector(self, leaf_selector): return lambda array: array[leaf_selector] def _get_path_to_node(self, node_id): - """ Return path to node as a list of split steps from the nodes of the sklearn Tree object """ + """ Return path to node as a list of split steps from the nodes of the sklearn Tree object. + + Args: + node_id (int): Node identifier. + + Returns: + path_to_node (str): Text describing the path from the root to the input node. + """ feature_names = self.pipeline_preprocessor.get_original_feature_names() children_left = list(self.error_tree.estimator_.tree_.children_left) children_right = list(self.error_tree.estimator_.tree_.children_right) @@ -404,7 +405,7 @@ def _inverse_transform_features(self): indicate what features are used to split the training set at each node. See https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html. - Return: + Returns: list or numpy.ndarray: indices of features of the Error Analyzer Tree, possibly mapped back to the original unprocessed feature space. @@ -421,7 +422,7 @@ def _inverse_transform_thresholds(self): the decision tree. The thresholds of a decision tree are the feature values used to split the training set at each node. See https://scikit-learn.org/stable/auto_examples/tree/plot_unveil_tree_structure.html. - Return: + Returns: numpy.ndarray: thresholds of the Error Tree, possibly with preprocessing undone. """ diff --git a/mealy/error_tree.py b/mealy/error_tree.py index 1688a6b..47b11e6 100644 --- a/mealy/error_tree.py +++ b/mealy/error_tree.py @@ -6,8 +6,32 @@ logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO, format='mealy | %(levelname)s - %(message)s') + class ErrorTree(object): + """ ErrorTree analyzes the errors of a prediction model on a test set. + + It uses model predictions and ground truth target to compute the model errors on the test set. 
+ It then trains a Decision Tree, called an Error Analyzer Tree, on the same test set by using the model error + as target. The nodes of the decision tree are different segments of errors to be studied individually. + + Args: + error_decision_tree (sklearn.tree.DecisionTreeClassifier): The estimator used to train the Error Tree. + + Attributes: + estimator_ (sklearn.tree.DecisionTreeClassifier): The estimator used to train the Error Tree. + impurity (numpy.ndarray): Impurity of leaves. + quantized_impurity (numpy.ndarray): Impurity of leaves quantized into ErrorAnalyzerConstants.NUMBER_PURITY_LEVELS + levels. + difference (numpy.ndarray): Difference of number of wrongly and correctly predicted samples in leaves. + total_error_fraction (numpy.ndarray): Percentage of incorrectly predicted samples in leaves over the total + number of errors (used to rank the nodes). + error_class_idx (int): Index of class of wrongly predicted samples in the Error Tree. + n_total_errors (int): Number of total errors. + wrongly_predicted_leaves (numpy.ndarray): Array of number of wrongly predicted samples in leaves. + correctly_predicted_leaves (numpy.ndarray): Array of number of correctly predicted samples in leaves. + leaf_ids (numpy.ndarray): List of all leaves indices. + """ def __init__(self, error_decision_tree): self._estimator = error_decision_tree diff --git a/mealy/error_visualizer.py b/mealy/error_visualizer.py index 2cf87c0..dfdd7fc 100644 --- a/mealy/error_visualizer.py +++ b/mealy/error_visualizer.py @@ -73,8 +73,7 @@ def _plot_feature_distribution(x_ticks, feature_is_numerical, leaf_data, root_da class ErrorVisualizer(_BaseErrorVisualizer): - """ - ErrorVisualizer provides visual utilities to analyze the Error Tree in ErrorAnalyzer + """ErrorVisualizer provides visual utilities to analyze the Error Tree in ErrorAnalyzer. Args: error_analyzer (ErrorAnalyzer): fitted ErrorAnalyzer representing the performance of a primary model. 
@@ -99,7 +98,7 @@ def plot_error_tree(self, size=None): Args: size (tuple): size of the output plot. - Return: + Returns: graphviz.Source: graph of the Error Analyzer Tree. """ diff --git a/mealy/metrics.py b/mealy/metrics.py index 1922bb5..8cf004e 100644 --- a/mealy/metrics.py +++ b/mealy/metrics.py @@ -5,6 +5,19 @@ def compute_confidence_decision(primary_model_true_accuracy, primary_model_predicted_accuracy): + """Return fidelity of the Error Tree and decision regarding its reliability. + + Args: + primary_model_true_accuracy (numpy.ndarray): Ground truth values of wrong/correct predictions of the error tree + primary model. Expected values in [ErrorAnalyzerConstants.WRONG_PREDICTION, + ErrorAnalyzerConstants.CORRECT_PREDICTION]. + primary_model_predicted_accuracy (numpy.ndarray): Estimated targets as returned by the error tree. Expected + values in [ErrorAnalyzerConstants.WRONG_PREDICTION, ErrorAnalyzerConstants.CORRECT_PREDICTION]. + + Returns: + fidelity (float): Fidelity score, measuring how well the Error Tree represents the original model errors. + decision (bool): Decision regarding whether to trust the Error Tree. + """ difference_true_pred_accuracy = np.abs(primary_model_true_accuracy - primary_model_predicted_accuracy) decision = difference_true_pred_accuracy <= ErrorAnalyzerConstants.TREE_ACCURACY_TOLERANCE @@ -15,15 +28,36 @@ def compute_confidence_decision(primary_model_true_accuracy, primary_model_predi def compute_accuracy_score(y_true, y_pred): + """Return the accuracy of predictions with respect to true values.""" return accuracy_score(y_true, y_pred) def compute_primary_model_accuracy(y): + """Return accuracy of the primary model. + + Args: + y (numpy.ndarray): Array indicating whether the model is correct for each sample. Expected values in + [ErrorAnalyzerConstants.WRONG_PREDICTION, ErrorAnalyzerConstants.CORRECT_PREDICTION]. + + Returns: + float: Estimated accuracy of the primary model. 
+ """ n_test_samples = y.shape[0] return float(np.count_nonzero(y == ErrorAnalyzerConstants.CORRECT_PREDICTION)) / n_test_samples def compute_fidelity_score(y_true, y_pred): + """Return fidelity of the Error Tree. + + Args: + y_true (numpy.ndarray): Ground truth values of wrong/correct predictions of the error tree primary model. + Expected values in [ErrorAnalyzerConstants.WRONG_PREDICTION, ErrorAnalyzerConstants.CORRECT_PREDICTION]. + y_pred (numpy.ndarray): Estimated targets as returned by the error tree. Expected values in + [ErrorAnalyzerConstants.WRONG_PREDICTION, ErrorAnalyzerConstants.CORRECT_PREDICTION]. + + Returns: + fidelity (float): Fidelity score, measuring how well the Error Tree represents the original model errors. + """ difference_true_pred_accuracy = np.abs(compute_primary_model_accuracy(y_true) - compute_primary_model_accuracy(y_pred)) fidelity = 1. - difference_true_pred_accuracy @@ -32,21 +66,32 @@ def compute_fidelity_score(y_true, y_pred): def fidelity_balanced_accuracy_score(y_true, y_pred): + """Return a custom metrics, as the sum of the fidelity and the balanced accuracy of the Error Tree. + + Args: + y_true (numpy.ndarray): Ground truth values of wrong/correct predictions of the error tree primary model. + Expected values in [ErrorAnalyzerConstants.WRONG_PREDICTION, ErrorAnalyzerConstants.CORRECT_PREDICTION]. + y_pred (numpy.ndarray): Estimated targets as returned by the error tree. Expected values in + [ErrorAnalyzerConstants.WRONG_PREDICTION, ErrorAnalyzerConstants.CORRECT_PREDICTION]. + + Returns: + dict or str: Dictionary or string report storing different metrics regarding the Error Tree. + """ return compute_fidelity_score(y_true, y_pred) + balanced_accuracy_score(y_true, y_pred) def error_decision_tree_report(y_true, y_pred, output_format='str'): - """Return a report showing the main Error Decision Tree metrics. + """Return a report showing the main Error Tree metrics. 
Args: y_true (numpy.ndarray): Ground truth values of wrong/correct predictions of the error tree primary model. Expected values in [ErrorAnalyzerConstants.WRONG_PREDICTION, ErrorAnalyzerConstants.CORRECT_PREDICTION]. y_pred (numpy.ndarray): Estimated targets as returned by the error tree. Expected values in [ErrorAnalyzerConstants.WRONG_PREDICTION, ErrorAnalyzerConstants.CORRECT_PREDICTION]. - output_format (string): Return format used for the report. Valid values are 'dict' or 'str'. + output_format (str): Return format used for the report. Valid values are 'dict' or 'str'. - Return: - dict or str: dictionary or string report storing different metrics regarding the Error Decision Tree. + Returns: + dict or str: Dictionary or string report storing different metrics regarding the Error Tree. """ tree_accuracy_score = compute_accuracy_score(y_true, y_pred) @@ -67,7 +112,7 @@ def error_decision_tree_report(y_true, y_pred, output_format='str'): if output_format == 'str': - report = 'The Error Decision Tree was trained with accuracy %.2f%% and balanced accuracy %.2f%%.' % (tree_accuracy_score * 100, tree_balanced_accuracy * 100) + report = 'The Error Tree was trained with accuracy %.2f%% and balanced accuracy %.2f%%.' % (tree_accuracy_score * 100, tree_balanced_accuracy * 100) report += '\n' report += 'The Decision Tree estimated the primary model''s accuracy to %.2f%%.' % \ (primary_model_predicted_accuracy * 100) diff --git a/mealy/preprocessing.py b/mealy/preprocessing.py index 25f509c..48cf33c 100644 --- a/mealy/preprocessing.py +++ b/mealy/preprocessing.py @@ -12,19 +12,19 @@ class FeatureNameTransformer(object): - """ Transformer of feature names and indices. + """Transformer of feature names and indices. - A FeatureNameTransformer parses an input Pipeline preprocessor and generate - a mapping between the input unprocessed feature names/indices and the output - preprocessed feature names/indices. 
+ A FeatureNameTransformer parses an input Pipeline preprocessor and generate + a mapping between the input unprocessed feature names/indices and the output + preprocessed feature names/indices. - Args: - ct_preprocessor (sklearn.compose.ColumnTransformer): preprocessor. - orig_feats (list): list of original unpreprocessed feature names, default=None. + Args: + ct_preprocessor (sklearn.compose.ColumnTransformer): Preprocessor. + original_features (list): List of original unpreprocessed feature names, default=None. - Attributes: - original_feature_names (list): list of original unpreprocessed feature names. - preprocessed_feature_names (list): list of preprocessed feature names. + Attributes: + original_feature_names (list): List of original unpreprocessed feature names. + preprocessed_feature_names (list): List of preprocessed feature names. """ def __init__(self, original_features, preprocessed_features): @@ -32,9 +32,11 @@ def __init__(self, original_features, preprocessed_features): self.preprocessed_feature_names = preprocessed_features def get_original_feature_names(self): + """Get the list of original unpreprocessed feature names.""" return self.original_feature_names def get_preprocessed_feature_names(self): + """Get the list of preprocessed feature names.""" return self.preprocessed_feature_names def is_categorical(self, index=None, name=None): @@ -59,14 +61,13 @@ def inverse_thresholds(self, tree, n_cols): class PipelinePreprocessor(FeatureNameTransformer): """Transformer of feature values from the original values to preprocessed ones. - A PipelinePreprocessor parses an input Pipeline preprocessor and generate - a mapping between the input unprocessed feature values and the output - preprocessed feature values. - - Args: - ct_preprocessor (sklearn.compose.ColumnTransformer): preprocessing steps. - original_features (list): list of original unpreprocessed feature names, default=None. 
+ A PipelinePreprocessor parses an input Pipeline preprocessor and generate + a mapping between the input unprocessed feature values and the output + preprocessed feature values. + Args: + ct_preprocessor (sklearn.compose.ColumnTransformer): Preprocessing steps + original_features (list): List of original unpreprocessed feature names, default=None. """ def __init__(self, ct_preprocessor, original_features=None): @@ -89,8 +90,7 @@ def __init__(self, ct_preprocessor, original_features=None): self._create_feature_mapping(ct_preprocessor) def _create_feature_mapping(self, ct_preprocessor): - """ - Update the dicts of input <-> output feature id mapping: self.original2preprocessed and self.preprocessed2original + """Update the dicts of input <-> output feature id mapping: original2preprocessed and preprocessed2original. Args: ct_preprocessor: a ColumnTransformer object. @@ -160,7 +160,7 @@ def transform(self, x): """Transform the input feature values according to the preprocessing pipeline. Args: - x (array-like or dataframe of shape (number of samples, number of features)): input feature values. + x (array-like or dataframe of shape (number of samples, number of features)): Input feature values. Return: numpy.ndarray: transformed feature values. @@ -194,11 +194,10 @@ def inverse_transform(self, preprocessed_x): """Invert the preprocessing pipeline and inverse transform feature values. Args: - preprocessed_x (numpy.ndarray or scipy sparse matrix): preprocessed feature values. - - Return: - numpy.ndarray: feature values without preprocessing. + preprocessed_x (numpy.ndarray or scipy sparse matrix): Preprocessed feature values. + Returns: + numpy.ndarray: Feature values without preprocessing. 
""" original_features = self.get_original_feature_names() undo_prep_test_x = np.zeros((preprocessed_x.shape[0], len(original_features)), dtype='O') @@ -298,11 +297,13 @@ def __init__(self, model_performance_predictor_features): preprocessed_features=model_performance_predictor_features) def transform(self, x): - """ + """ Transformation of the input (here, transformation is identity). + Args: - x (array-like or dataframe of shape (number of samples, number of features)): input feature values. + x (array-like or pandas.DataFrale of shape (number of samples, number of features)): Input feature values. + Returns: - ndarray + numpy.ndarray: Transformed feature values. """ if isinstance(x, pd.DataFrame): return x.values