diff --git a/amlb/datautils.py b/amlb/datautils.py index 1d4a31bd2..3e0f967a4 100644 --- a/amlb/datautils.py +++ b/amlb/datautils.py @@ -14,7 +14,9 @@ import numpy as np import pandas as pd from sklearn.base import TransformerMixin -from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, log_loss, balanced_accuracy_score, mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score, roc_auc_score # just aliasing +from sklearn.metrics import accuracy_score, auc, average_precision_score, balanced_accuracy_score, confusion_matrix, fbeta_score, \ + log_loss, mean_absolute_error, mean_squared_error, mean_squared_log_error, precision_recall_curve, \ + r2_score, roc_auc_score # just aliasing from sklearn.preprocessing import LabelEncoder, LabelBinarizer, OneHotEncoder from .utils import profile, path_from_split, repr_def, split_path, touch diff --git a/amlb/results.py b/amlb/results.py index 3fa6ac37c..3cb5f0727 100644 --- a/amlb/results.py +++ b/amlb/results.py @@ -16,7 +16,9 @@ import pandas as pd from .data import Dataset, DatasetType, Feature -from .datautils import accuracy_score, confusion_matrix, f1_score, log_loss, balanced_accuracy_score, mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score, roc_auc_score, read_csv, write_csv, is_data_frame, to_data_frame +from .datautils import accuracy_score, auc, average_precision_score, balanced_accuracy_score, confusion_matrix, fbeta_score, log_loss, \ + mean_absolute_error, mean_squared_error, mean_squared_log_error, precision_recall_curve, r2_score, roc_auc_score, \ + read_csv, write_csv, is_data_frame, to_data_frame from .resources import get as rget, config as rconfig, output_dirs from .utils import Namespace, backup_file, cached, datetime_iso, json_load, memoize, profile @@ -394,6 +396,10 @@ def do_score(m): for metric in metadata.metrics or []: scores[metric] = do_score(metric) scores.result = scores[scores.metric] if scores.metric in scores else do_score(scores.metric) 
+ if not higher_is_better(scores.metric): + scores.metric = f"neg_{scores.metric}" + scores.result = - scores.result + scores.info = result.info if scoring_errors: scores.info = "; ".join(filter(lambda it: it, [scores.info, *scoring_errors])) @@ -453,6 +459,8 @@ def __init__(self, error): class ClassificationResult(Result): + multi_class_average = 'weighted' # used by metrics like fbeta or auc + def __init__(self, predictions_df, info=None): super().__init__(predictions_df, info) self.classes = self.df.columns[:-2].values.astype(str, copy=False) @@ -464,42 +472,80 @@ def __init__(self, predictions_df, info=None): self.labels = self._autoencode(self.classes) def acc(self): + """Accuracy""" return float(accuracy_score(self.truth, self.predictions)) - def balacc(self): - return float(balanced_accuracy_score(self.truth, self.predictions)) - def auc(self): + """Area Under (ROC) Curve, computed on probabilities, not on predictions""" if self.type != DatasetType.binary: - # raise ValueError("AUC metric is only supported for binary classification: {}.".format(self.classes)) - log.warning("AUC metric is only supported for binary classification: %s.", self.labels) + log.warning("For multiclass problems, please use `auc_ovr` or `auc_ovo` metrics instead of `auc`.") return nan - return float(roc_auc_score(self.truth, self.probabilities[:, 1], labels=self.labels)) + return float(roc_auc_score(self.truth, self.probabilities[:, 1])) - def cm(self): - return confusion_matrix(self.truth, self.predictions, labels=self.labels) + def auc_ovo(self): + """AUC One-vs-One""" + return self._auc_multi(mc='ovo') - def _per_class_errors(self): - return [(s-d)/s for s, d in ((sum(r), r[i]) for i, r in enumerate(self.cm()))] + def auc_ovr(self): + """AUC One-vs-Rest""" + return self._auc_multi(mc='ovr') - def mean_pce(self): - """mean per class error""" - return statistics.mean(self._per_class_errors()) + def balacc(self): + """Balanced accuracy""" + return 
float(balanced_accuracy_score(self.truth, self.predictions)) - def max_pce(self): - """max per class error""" - return max(self._per_class_errors()) + def f05(self): + """F-beta 0.5""" + return self._fbeta(0.5) def f1(self): - return float(f1_score(self.truth, self.predictions, labels=self.labels)) + """F-beta 1""" + return self._fbeta(1) + + def f2(self): + """F-beta 2""" + return self._fbeta(2) def logloss(self): + """Log Loss""" return float(log_loss(self.truth, self.probabilities, labels=self.labels)) + def max_pce(self): + """Max per Class Error""" + return max(self._per_class_errors()) + + def mean_pce(self): + """Mean per Class Error""" + return statistics.mean(self._per_class_errors()) + + def pr_auc(self): + """Precision Recall AUC""" + if self.type != DatasetType.binary: + log.warning("PR AUC metric is only available for binary problems.") + return nan + # precision, recall, thresholds = precision_recall_curve(self.truth, self.probabilities[:, 1]) + # return float(auc(recall, precision)) + return float(average_precision_score(self.truth, self.probabilities[:, 1])) + def _autoencode(self, vec): needs_encoding = not _encode_predictions_and_truth_ or (isinstance(vec[0], str) and not vec[0].isdigit()) return self.target.label_encoder.transform(vec) if needs_encoding else vec + def _auc_multi(self, mc='raise'): + average = ClassificationResult.multi_class_average + return float(roc_auc_score(self.truth, self.probabilities, average=average, labels=self.labels, multi_class=mc)) + + def _cm(self): + return confusion_matrix(self.truth, self.predictions, labels=self.labels) + + def _fbeta(self, beta): + average = ClassificationResult.multi_class_average if self.type == DatasetType.multiclass else 'binary' # averaging mode depends on the problem type + return float(fbeta_score(self.truth, self.predictions, beta=beta, average=average, labels=self.labels)) + + def _per_class_errors(self): + return [(s-d)/s for s, d in ((sum(r), r[i]) for i, r in enumerate(self._cm()))] + + class RegressionResult(Result): @@ 
-510,24 +556,34 @@ def __init__(self, predictions_df, info=None): self.type = DatasetType.regression def mae(self): + """Mean Absolute Error""" return float(mean_absolute_error(self.truth, self.predictions)) def mse(self): + """Mean Squared Error""" return float(mean_squared_error(self.truth, self.predictions)) def msle(self): + """Mean Squared Logarithmic Error""" return float(mean_squared_log_error(self.truth, self.predictions)) def rmse(self): + """Root Mean Square Error""" return math.sqrt(self.mse()) def rmsle(self): + """Root Mean Square Logarithmic Error""" return math.sqrt(self.msle()) def r2(self): + """R^2""" return float(r2_score(self.truth, self.predictions)) +def higher_is_better(metric): + return re.fullmatch(r"((pr_)?auc(_\w*)?)|(\w*acc)|(f\d+)|(r2)", metric) + + _encode_predictions_and_truth_ = False save_predictions = TaskResult.save_predictions diff --git a/amlb_report/results.py b/amlb_report/results.py index 9f03293f7..2bb0ca6f8 100644 --- a/amlb_report/results.py +++ b/amlb_report/results.py @@ -1,11 +1,12 @@ """ -Loading results, formatting and adding columns -result is the raw result metric computed from predictions at the end the benchmark. For classification problems, it is usually auc for binomial classification and logloss for multinomial classification. -score ensures a standard comparison between tasks: higher is always better. -norm_score is a normalization of score on a [0, 1] scale, with {{zero_one_refs[0]}} score as 0 and {{zero_one_refs[1]}} score as 1. -imp_result and imp_score for imputed results/scores. Given a task and a framework: -if all folds results/scores are missing, then no imputation occurs, and the result is nan for each fold. -if only some folds results/scores are missing, then the missing result is imputed by the {{imp_framework}} result for this fold. +Loading results, formatting and adding columns. +result is the raw result metric computed from predictions at the end of the benchmark: higher is always better! 
+ - For classification problems, it is usually auc for binary problems and negative log loss for multiclass problems. + - For regression problems, it is usually negative rmse. +norm_result is a normalization of result on a [0, 1] scale, with {{zero_one_refs[0]}} scoring as 0 and {{zero_one_refs[1]}} scoring as 1. +imp_result for imputed results. Given a task and a framework: + - if all folds results are missing, then no imputation occurs, and the result is nan for each fold. + - if only some folds results are missing, then the missing result is imputed by the {{imp_framework}} result for this fold. """ import numpy as np @@ -52,35 +53,21 @@ def imputed(row): return pd.isna(row.result) and pd.notna(row.imp_result) -fit_metrics = ['auc', 'acc', 'r2'] - - -def metric_type(row, res_col='result'): - return 'fit' if any([row[res_col] == getattr(row, m, None) for m in fit_metrics]) else 'loss' - - -def score(row, res_col='result'): - return (row[res_col] if row['metric_type'] == 'fit' - else - row[res_col]) - - -def norm_score(row, score_col='score', - zero_one_refs=None, ref_results=None, - aggregation=None): +def norm_result(row, res_col='result', zero_one_refs=None, ref_results=None, aggregation=None): if zero_one_refs is None: - return row[score_col] + return row[res_col] def get_val(ref, default): try: if isinstance(ref, str): return (ref_results.loc[(ref_results.framework == ref) & (ref_results.task == row.task)] - [score_col] + [res_col] .agg(aggregation) if aggregation else ref_results.loc[(ref_results.framework == ref) & (ref_results.task == row.task) & (ref_results.fold == row.fold)] - [score_col] + [res_col] .item()) else: return ref @@ -89,9 +76,9 @@ def get_val(ref, default): # return default zero, one = (get_val(ref, i) for i, ref in enumerate(zero_one_refs)) - rel_score = (row[score_col] - zero) / (one - zero) - return (- rel_score if row['metric_type'] == 'loss' and one < 0 <= zero - else rel_score) + norm_res = (row[res_col] - zero) / (one - zero) + 
return (- norm_res if row['metric'].startswith("neg_") and one < 0 <= zero + else norm_res) def sorted_ints(arr): @@ -117,7 +104,8 @@ def prepare_results(results, imputation=None, normalization=None, ref_results=None, - duplicates_handling='fail' # other options are 'keep_first', 'keep_last', 'keep_none' + duplicates_handling='fail', # other options are 'keep_first', 'keep_last', 'keep_none' + include_metadata=False ): if results is None or len(results) == 0: return None @@ -139,7 +127,7 @@ def prepare_results(results, folds = results.fold.unique() - metadata = load_dataset_metadata(results) + metadata = load_dataset_metadata(results) if include_metadata else {} done = results.set_index(['task', 'fold', 'framework']) done = remove_duplicates(done, handling=duplicates_handling) @@ -158,9 +146,8 @@ def prepare_results(results, # extending the data frame results = results.append(missing.reset_index()) - results['type'] = [task_prop(row, metadata, 'type') for _, row in results.iterrows()] - results['metric_type'] = [metric_type(row) for _, row in results.iterrows()] - results['score'] = [score(row) for _, row in results.iterrows()] + if 'type' not in results: + results['type'] = [task_prop(row, metadata, 'type') for _, row in results.iterrows()] if ref_results is None: ref_results = results @@ -177,18 +164,14 @@ def prepare_results(results, imp_framework=imp_fr, imp_results=ref_results, imp_value=imp_val, aggregation=aggr) for _, row in results.iterrows()] - results['imp_score'] = [impute_result(row, results, 'score', - imp_framework=imp_fr, imp_results=ref_results, - imp_value=imp_val, aggregation=aggr) - for _, row in results.iterrows()] if normalization is not None: - score_col = 'imp_score' if imputation is not None else 'score' + res_col = 'imp_result' if imputation is not None else 'result' zero_one = normalization[0:2] aggr = normalization[2] if len(normalization) > 2 else None - results['norm_score'] = [norm_score(row, score_col, - zero_one_refs=zero_one, 
ref_results=ref_results, aggregation=aggr) - for _, row in results.iterrows()] + results['norm_result'] = [norm_result(row, res_col, + zero_one_refs=zero_one, ref_results=ref_results, aggregation=aggr) + for _, row in results.iterrows()] return Namespace( results=results, diff --git a/reports/reports.ipynb b/reports/reports.ipynb index 35908f77c..109022fc1 100644 --- a/reports/reports.ipynb +++ b/reports/reports.ipynb @@ -100,12 +100,11 @@ "source": [ "#### Loading results, formatting and adding columns\n", "- `result` is the raw result metric computed from predictions at the end the benchmark.\n", - " For classification problems, it is usually `auc` for binomial classification and `logloss` for multinomial classification.\n", - "- `score` ensures a standard comparison between tasks: **higher is always better**.\n", - "- `norm_score` is a normalization of `score` on a `[0, 1]` scale, with `{{normalization[0]}}` score as `0` and `{{normalization[1]}}` score as `1`.\n", - "- `imp_result` and `imp_score` for imputed results/scores. Given a task and a framework:\n", - " - if **all folds results/scores are missing**, then no imputation occurs, and the result is `nan` for each fold.\n", - " - if **only some folds results/scores are missing**, then the missing result can be imputed by setting `{{imputation='framework'}}` and use that framework to impute the result for this fold." + " For classification problems, it is usually `auc` for binomial classification and `neg_logloss` for multinomial classification (higher is always better).\n", + "- `norm_result` is a normalization of `result` on a `[0, 1]` scale, with `{{normalization[0]}}` result as `0` and `{{normalization[1]}}` result as `1`.\n", + "- `imp_result` for imputed results. 
Given a task and a framework:\n", + " - if **all folds results are missing**, then no imputation occurs, and the result is `nan` for each fold.\n", + " - if **only some folds results are missing**, then the missing result can be imputed by setting `{{imputation='framework'}}` and use that framework to impute the result for this fold." ] }, { @@ -144,11 +143,8 @@ "# row_filter = lamdba r: r.fold == 0 #! r is a pd.Series\n", "title_extra = \"\"\n", "binary_result_label = 'AUC'\n", - "binary_score_label = 'AUC'\n", - "multiclass_result_label = 'logloss'\n", - "multiclass_score_label = 'neg. logloss'\n", - "regression_result_label = 'RMSE'\n", - "regression_score_label = 'neg. RMSE'\n", + "multiclass_result_label = 'neg. Log loss'\n", + "regression_result_label = 'neg. RMSE'\n", "\n", "# register_colormap(config.colormap, ('colorblind', [1, 0, 2, 3, 4, 5]))" ] @@ -255,6 +251,7 @@ " exclusions=excluded_frameworks,\n", " normalization=normalization,\n", " duplicates_handling=duplicates_handling,\n", + " include_metadata=True\n", " ) \n", " for name, run in runs.items() if runs[name].get('ref', False)}" ] @@ -474,11 +471,11 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Averaging using arithmetic mean over fold `result` or `score`.\n", - "In following summaries, if not mentioned otherwise, and if scores imputation was enabled, the means are computed over imputed results/scores .\n", + "Averaging using arithmetic mean over fold `result`.\n", + "In following summaries, if not mentioned otherwise, and if results imputation was enabled, the means are computed over imputed results .\n", "Given a task and a framework:\n", - "- if **all folds results/scores are missing**, then no imputation occured, and the mean result is `nan`.\n", - "- if **only some folds results/scores are missing**, then the amount of imputed results that contributed to the mean are displayed between parenthesis." 
+ "- if **all folds results are missing**, then no imputation occurred, and the mean result is `nan`.\n", + "- if **only some folds results are missing**, then the number of imputed results that contributed to the mean is displayed in parentheses." ] }, { @@ -522,24 +519,6 @@ "res_summary.to_csv(create_file(output_dir, \"tables\", \"results_summary.csv\"))" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Score mean" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "score_summary = render_summary('score', \n", - " results=all_res)\n", - "score_summary.to_csv(create_file(output_dir, \"tables\", \"score_summary.csv\"))" - ] - }, { "cell_type": "code", "execution_count": null, @@ -547,9 +526,9 @@ "outputs": [], "source": [ "if normalization:\n", - " norm_score_summary = render_summary('norm_score', \n", - " results=all_res)\n", - " norm_score_summary.to_csv(create_file(output_dir, \"tables\", \"normalized_score_summary.csv\"))" + " norm_result_summary = render_summary('norm_result', \n", + " results=all_res)\n", + " norm_result_summary.to_csv(create_file(output_dir, \"tables\", \"normalized_result_summary.csv\"))" ] }, { @@ -565,7 +544,7 @@ "metadata": {}, "outputs": [], "source": [ - "benchmark_leaderboard = render_leaderboard('score', \n", + "benchmark_leaderboard = render_leaderboard('result', \n", " results=all_res,\n", " aggregate=True)\n", "benchmark_leaderboard.to_csv(create_file(output_dir, \"tables\", \"benchmark_leaderboard.csv\"))" @@ -575,7 +554,7 @@ "cell_type": "raw", "metadata": {}, "source": [ - "folds_leaderboard = render_leaderboard('score', \n", + "folds_leaderboard = render_leaderboard('result', \n", " results=all_res,\n", " aggregate=False)\n", "folds_leaderboard.to_csv(create_file(output_dir, \"tables\", \"folds_leaderboard.csv\"))" @@ -611,17 +590,17 @@ "outputs": [], "source": [ "if 'binary' in problem_types:\n", - " fig = draw_score_heatmap('score',\n", + " fig = 
draw_score_heatmap('score',\n", + " fig = draw_score_heatmap('result',\n", " results=all_res,\n", " type_filter='binary', \n", " metadata=metadata,\n", " x_labels=frameworks_labels or True,\n", " x_sort_by=frameworks_sort_key,\n", " y_sort_by='nrows',\n", - " title=f\"Scores ({binary_score_label}) on {results_group} binary classification problems{title_extra}\",\n", + " title=f\"Results ({binary_result_label}) on {results_group} binary classification problems{title_extra}\",\n", " center=0.5\n", " );\n", - " savefig(fig, create_file(output_dir, \"visualizations\", \"binary_score_heat.png\"))" + " savefig(fig, create_file(output_dir, \"visualizations\", \"binary_result_heat.png\"))" ] }, { @@ -631,17 +610,17 @@ "outputs": [], "source": [ "if 'multiclass' in problem_types:\n", - " fig = draw_score_heatmap('score', \n", + " fig = draw_score_heatmap('result', \n", " results=all_res,\n", " type_filter='multiclass', \n", " metadata=metadata,\n", " x_labels=frameworks_labels or True,\n", " x_sort_by=frameworks_sort_key,\n", " y_sort_by='nrows',\n", - " title=f\"Scores ({multiclass_score_label}) on {results_group} multi-class classification problems{title_extra}\",\n", + " title=f\"Results ({multiclass_result_label}) on {results_group} multi-class classification problems{title_extra}\",\n", " center=0\n", " );\n", - " savefig(fig, create_file(output_dir, \"visualizations\", \"multiclass_score_heat.png\"))" + " savefig(fig, create_file(output_dir, \"visualizations\", \"multiclass_result_heat.png\"))" ] }, { @@ -651,17 +630,17 @@ "outputs": [], "source": [ "if 'regression' in problem_types:\n", - " fig = draw_score_heatmap('score', \n", + " fig = draw_score_heatmap('result', \n", " results=all_res,\n", " type_filter='regression', \n", " metadata=metadata,\n", " x_labels=frameworks_labels or True,\n", " x_sort_by=frameworks_sort_key,\n", " y_sort_by='nrows',\n", - " title=f\"Scores ({regression_score_label}) on {results_group} regression problems{title_extra}\",\n", + " 
title=f\"Results ({regression_result_label}) on {results_group} regression problems{title_extra}\",\n", " center=0\n", " );\n", - " savefig(fig, create_file(output_dir, \"visualizations\", \"regression_score_heat.png\"))" + " savefig(fig, create_file(output_dir, \"visualizations\", \"regression_result_heat.png\"))" ] }, { @@ -678,20 +657,20 @@ "outputs": [], "source": [ "if 'binary' in problem_types:\n", - " fig = draw_score_barplot('score',\n", + " fig = draw_score_barplot('result',\n", " results=all_res,\n", " type_filter='binary', \n", " metadata=metadata,\n", " x_sort_by=tasks_sort_by,\n", - " ylabel=binary_score_label,\n", + " ylabel=binary_result_label,\n", " ylim=dict(bottom=.5),\n", " hue_sort_by=frameworks_sort_key, \n", " ci=95,\n", - " title=f\"Scores ({binary_score_label}) on {results_group} binary classification problems{title_extra}\",\n", + " title=f\"Results ({binary_result_label}) on {results_group} binary classification problems{title_extra}\",\n", " legend_loc='lower center',\n", " legend_labels=frameworks_labels,\n", " );\n", - " savefig(fig, create_file(output_dir, \"visualizations\", \"binary_score_barplot.png\"))" + " savefig(fig, create_file(output_dir, \"visualizations\", \"binary_result_barplot.png\"))" ] }, { @@ -701,20 +680,20 @@ "outputs": [], "source": [ "if 'multiclass' in problem_types:\n", - " fig = draw_score_barplot('score',\n", + " fig = draw_score_barplot('result',\n", " results=all_res,\n", " type_filter='multiclass', \n", " metadata=metadata,\n", " x_sort_by=tasks_sort_by,\n", - " ylabel=multiclass_score_label,\n", + " ylabel=multiclass_result_label,\n", " ylim=dict(top=0.1),\n", " hue_sort_by=frameworks_sort_key,\n", " ci=95,\n", - " title=f\"Scores ({multiclass_score_label}) on {results_group} multiclass classification problems{title_extra}\",\n", + " title=f\"Results ({multiclass_result_label}) on {results_group} multiclass classification problems{title_extra}\",\n", " legend_loc='lower center',\n", " 
legend_labels=frameworks_labels,\n", " );\n", - " savefig(fig, create_file(output_dir, \"visualizations\", \"multiclass_score_barplot.png\"))" + " savefig(fig, create_file(output_dir, \"visualizations\", \"multiclass_result_barplot.png\"))" ] }, { @@ -724,22 +703,22 @@ "outputs": [], "source": [ "if 'regression' in problem_types:\n", - " fig = draw_score_barplot('score',\n", + " fig = draw_score_barplot('result',\n", " results=all_res,\n", " type_filter='regression', \n", " metadata=metadata,\n", " x_sort_by=tasks_sort_by,\n", " yscale='symlog',\n", - " ylabel=regression_score_label,\n", + " ylabel=regression_result_label,\n", " ylim=dict(top=0.1),\n", " hue_sort_by=frameworks_sort_key, \n", " ci=95,\n", - " title=f\"Scores ({regression_score_label}) on {results_group} regression classification problems{title_extra}\",\n", + " title=f\"Results ({regression_result_label}) on {results_group} regression problems{title_extra}\",\n", " legend_loc='lower center',\n", " legend_labels=frameworks_labels,\n", " size=(8, 6),\n", " );\n", - " savefig(fig, create_file(output_dir, \"visualizations\", \"regression_score_barplot.png\"))" + " savefig(fig, create_file(output_dir, \"visualizations\", \"regression_result_barplot.png\"))" ] }, { @@ -756,20 +735,20 @@ "outputs": [], "source": [ "if 'binary' in problem_types:\n", - " fig = draw_score_pointplot('score',\n", + " fig = draw_score_pointplot('result',\n", " results=all_res,\n", " type_filter='binary', \n", " metadata=metadata,\n", " x_sort_by=tasks_sort_by,\n", - " ylabel=binary_score_label,\n", + " ylabel=binary_result_label,\n", " ylim=dict(bottom=.5),\n", " hue_sort_by=frameworks_sort_key,\n", " join='none', marker='hline_xspaced', ci=95, \n", - " title=f\"Scores ({binary_score_label}) on {results_group} binary classification problems{title_extra}\",\n", + " title=f\"Results ({binary_result_label}) on {results_group} binary classification problems{title_extra}\",\n", " legend_loc='lower center',\n", " 
legend_labels=frameworks_labels,\n", " );\n", - " savefig(fig, create_file(output_dir, \"visualizations\", \"binary_score_pointplot.png\"))" + " savefig(fig, create_file(output_dir, \"visualizations\", \"binary_result_pointplot.png\"))" ] }, { @@ -779,19 +758,19 @@ "outputs": [], "source": [ "if 'multiclass' in problem_types:\n", - " fig = draw_score_pointplot('score',\n", + " fig = draw_score_pointplot('result',\n", " results=all_res,\n", " type_filter='multiclass', \n", " metadata=metadata,\n", " x_sort_by=tasks_sort_by,\n", - " ylabel=multiclass_score_label,\n", + " ylabel=multiclass_result_label,\n", " hue_sort_by=frameworks_sort_key,\n", " join='none', marker='hline_xspaced', ci=95, \n", - " title=f\"Scores ({multiclass_score_label}) on {results_group} multiclass classification problems{title_extra}\",\n", + " title=f\"Results ({multiclass_result_label}) on {results_group} multiclass classification problems{title_extra}\",\n", " legend_loc='lower center',\n", " legend_labels=frameworks_labels,\n", " );\n", - " savefig(fig, create_file(output_dir, \"visualizations\", \"multiclass_score_pointplot.png\"))" + " savefig(fig, create_file(output_dir, \"visualizations\", \"multiclass_result_pointplot.png\"))" ] }, { @@ -801,22 +780,22 @@ "outputs": [], "source": [ "if 'regression' in problem_types:\n", - " fig = draw_score_pointplot('score',\n", + " fig = draw_score_pointplot('result',\n", " results=all_res,\n", " type_filter='regression', \n", " metadata=metadata,\n", " x_sort_by=tasks_sort_by,\n", - " ylabel=regression_score_label,\n", + " ylabel=regression_result_label,\n", " yscale='symlog',\n", " ylim=dict(top=0.1),\n", " hue_sort_by=frameworks_sort_key,\n", " join='none', marker='hline_xspaced', ci=95, \n", - " title=f\"Scores ({regression_score_label}) on {results_group} regression classification problems{title_extra}\",\n", + " title=f\"Results ({regression_result_label}) on {results_group} regression problems{title_extra}\",\n", " 
legend_loc='lower center',\n", " legend_labels=frameworks_labels,\n", " size=(8, 6),\n", " );\n", - " savefig(fig, create_file(output_dir, \"visualizations\", \"regression_score_pointplot.png\"))" + " savefig(fig, create_file(output_dir, \"visualizations\", \"regression_result_pointplot.png\"))" ] }, { @@ -833,17 +812,17 @@ "outputs": [], "source": [ "if 'binary' in problem_types:\n", - " fig = draw_score_stripplot('score', \n", + " fig = draw_score_stripplot('result', \n", " results=all_res.sort_values(by=['framework']),\n", " type_filter='binary', \n", " metadata=metadata,\n", - " xlabel=binary_score_label,\n", + " xlabel=binary_result_label,\n", " y_sort_by=tasks_sort_by,\n", " hue_sort_by=frameworks_sort_key,\n", - " title=f\"Scores ({binary_score_label}) on {results_group} binary classification problems{title_extra}\",\n", + " title=f\"Results ({binary_result_label}) on {results_group} binary classification problems{title_extra}\",\n", " legend_labels=frameworks_labels,\n", " );\n", - " savefig(fig, create_file(output_dir, \"visualizations\", \"binary_score_stripplot.png\"))" + " savefig(fig, create_file(output_dir, \"visualizations\", \"binary_result_stripplot.png\"))" ] }, { @@ -853,18 +832,18 @@ "outputs": [], "source": [ "if 'multiclass' in problem_types:\n", - " fig = draw_score_stripplot('score', \n", + " fig = draw_score_stripplot('result', \n", " results=all_res.sort_values(by=['framework']),\n", " type_filter='multiclass', \n", " metadata=metadata,\n", - " xlabel=multiclass_score_label,\n", + " xlabel=multiclass_result_label,\n", " xscale='symlog',\n", " y_sort_by=tasks_sort_by,\n", " hue_sort_by=frameworks_sort_key,\n", - " title=f\"Scores ({multiclass_score_label}) on {results_group} multi-class classification problems{title_extra}\",\n", + " title=f\"Results ({multiclass_result_label}) on {results_group} multi-class classification problems{title_extra}\",\n", " legend_labels=frameworks_labels,\n", " );\n", - " savefig(fig, create_file(output_dir, 
\"visualizations\", \"multiclass_score_stripplot.png\"))" + " savefig(fig, create_file(output_dir, \"visualizations\", \"multiclass_result_stripplot.png\"))" ] }, { @@ -874,18 +853,18 @@ "outputs": [], "source": [ "if 'regression' in problem_types:\n", - " fig = draw_score_stripplot('score', \n", + " fig = draw_score_stripplot('result', \n", " results=all_res,\n", " type_filter='regression', \n", " metadata=metadata,\n", - " xlabel=regression_score_label,\n", + " xlabel=regression_result_label,\n", " xscale='symlog',\n", " y_sort_by=tasks_sort_by,\n", " hue_sort_by=frameworks_sort_key,\n", - " title=f\"Scores ({regression_score_label}) on {results_group} regression problems{title_extra}\",\n", + " title=f\"Results ({regression_result_label}) on {results_group} regression problems{title_extra}\",\n", " legend_labels=frameworks_labels,\n", " );\n", - " savefig(fig, create_file(output_dir, \"visualizations\", \"regression_score_stripplot.png\"))" + " savefig(fig, create_file(output_dir, \"visualizations\", \"regression_result_stripplot.png\"))" ] }, { @@ -902,17 +881,17 @@ "outputs": [], "source": [ "if 'binary' in problem_types and normalization:\n", - " fig = draw_score_stripplot('norm_score', \n", + " fig = draw_score_stripplot('norm_result', \n", " results=all_res,\n", " type_filter='binary', \n", " metadata=metadata,\n", - " xlabel=f\"rel. {binary_score_label}\",\n", + " xlabel=f\"rel. 
{binary_result_label}\",\n", " y_sort_by='nrows',\n", " hue_sort_by=frameworks_sort_key,\n", - " title=f\"Relative scores ({binary_score_label}) on {results_group} binary classification problems{title_extra}\",\n", + " title=f\"Relative results ({binary_result_label}) on {results_group} binary classification problems{title_extra}\",\n", " legend_labels=frameworks_labels,\n", " );\n", - " savefig(fig, create_file(output_dir, \"visualizations\", \"binary_rel_score_stripplot.png\"))" + " savefig(fig, create_file(output_dir, \"visualizations\", \"binary_rel_result_stripplot.png\"))" ] }, { @@ -922,18 +901,18 @@ "outputs": [], "source": [ "if 'multiclass' in problem_types and normalization:\n", - " fig = draw_score_stripplot('norm_score', \n", + " fig = draw_score_stripplot('norm_result', \n", " results=all_res,\n", " type_filter='multiclass', \n", " metadata=metadata,\n", - " xlabel=f\"rel. {multiclass_score_label}\",\n", + " xlabel=f\"rel. {multiclass_result_label}\",\n", " xscale='symlog',\n", " y_sort_by='nrows',\n", " hue_sort_by=frameworks_sort_key,\n", - " title=f\"Relative scores ({multiclass_score_label}) on {results_group} multi-class classification problems{title_extra}\",\n", + " title=f\"Relative results ({multiclass_result_label}) on {results_group} multi-class classification problems{title_extra}\",\n", " legend_labels=frameworks_labels,\n", " );\n", - " savefig(fig, create_file(output_dir, \"visualizations\", \"multiclass_rel_score_stripplot.png\"))" + " savefig(fig, create_file(output_dir, \"visualizations\", \"multiclass_rel_result_stripplot.png\"))" ] }, { @@ -943,17 +922,17 @@ "outputs": [], "source": [ "if 'regression' in problem_types and normalization:\n", - " fig = draw_score_stripplot('norm_score', \n", + " fig = draw_score_stripplot('norm_result', \n", " results=all_res,\n", " type_filter='regression', \n", " metadata=metadata,\n", - " xlabel=f\"rel. {regression_score_label}\",\n", + " xlabel=f\"rel. 
{regression_result_label}\",\n", " y_sort_by='nrows',\n", " hue_sort_by=frameworks_sort_key,\n", - " title=f\"Relative scores ({regression_score_label}) on {results_group} regression problems{title_extra}\",\n", + " title=f\"Relative results ({regression_result_label}) on {results_group} regression problems{title_extra}\",\n", " legend_labels=frameworks_labels,\n", " );\n", - " savefig(fig, create_file(output_dir, \"visualizations\", \"regression_rel_score_stripplot.png\"))" + " savefig(fig, create_file(output_dir, \"visualizations\", \"regression_rel_result_stripplot.png\"))" ] }, { diff --git a/resources/config.yaml b/resources/config.yaml index e217613a7..a1dfbf1ec 100644 --- a/resources/config.yaml +++ b/resources/config.yaml @@ -50,10 +50,12 @@ benchmarks: os_mem_size_mb: 2048 # the default amount of memory left to the OS when task assigned memory is computed automatically. os_vol_size_mb: 2048 # the default amount of volume left to the OS when task volume memory is verified. overhead_time_seconds: 3600 # amount of additional time allowed for the job to complete before sending an interruption signal - metrics: # default metrics by dataset type (as listed by amlb.data.DatasetType), only the first metric is optimized by the frameworks, the others are computed only for information purpose. - binary: ['auc', 'logloss', 'acc', 'balacc'] - multiclass: ['logloss', 'acc', 'balacc'] - regression: ['rmse', 'r2', 'mae'] + metrics: # default metrics by dataset type (as listed by amlb.data.DatasetType), + # only the first metric is optimized by the frameworks, + # the others are computed only for information purpose. + binary: ['auc', 'logloss', 'acc', 'balacc'] # available metrics: auc (AUC), acc (Accuracy), balacc (Balanced Accuracy), pr_auc (Precision Recall AUC), logloss (Log Loss), f1, f2, f05 (F-beta scores with beta=1, 2, or 0.5), max_pce, mean_pce (Max/Mean Per-Class Error). 
+ multiclass: ['logloss', 'acc', 'balacc'] # available metrics: same as for binary, except auc, replaced by auc_ovo (AUC One-vs-One), auc_ovr (AUC One-vs-Rest). AUC metrics and F-beta metrics are computed with weighted average. + regression: ['rmse', 'r2', 'mae'] # available metrics: mae (Mean Absolute Error), mse (Mean Squared Error), msle (Mean Squared Logarithmic Error), rmse (Root Mean Square Error), rmsle (Root Mean Square Logarithmic Error), r2 (R^2). defaults: folds: 10 max_runtime_seconds: 3600