diff --git a/amlb/datautils.py b/amlb/datautils.py
index 1d4a31bd2..3e0f967a4 100644
--- a/amlb/datautils.py
+++ b/amlb/datautils.py
@@ -14,7 +14,9 @@
 import numpy as np
 import pandas as pd
 from sklearn.base import TransformerMixin
-from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, log_loss, balanced_accuracy_score, mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score, roc_auc_score  # just aliasing
+from sklearn.metrics import accuracy_score, auc, average_precision_score, balanced_accuracy_score, confusion_matrix, fbeta_score, \
+    log_loss, mean_absolute_error, mean_squared_error, mean_squared_log_error, precision_recall_curve, \
+    r2_score, roc_auc_score  # just aliasing
 from sklearn.preprocessing import LabelEncoder, LabelBinarizer, OneHotEncoder
 
 from .utils import profile, path_from_split, repr_def, split_path, touch
diff --git a/amlb/results.py b/amlb/results.py
index 3fa6ac37c..3cb5f0727 100644
--- a/amlb/results.py
+++ b/amlb/results.py
@@ -16,7 +16,9 @@
 import pandas as pd
 
 from .data import Dataset, DatasetType, Feature
-from .datautils import accuracy_score, confusion_matrix, f1_score, log_loss, balanced_accuracy_score, mean_absolute_error, mean_squared_error, mean_squared_log_error, r2_score, roc_auc_score, read_csv, write_csv, is_data_frame, to_data_frame
+from .datautils import accuracy_score, auc, average_precision_score, balanced_accuracy_score, confusion_matrix, fbeta_score, log_loss, \
+    mean_absolute_error, mean_squared_error, mean_squared_log_error, precision_recall_curve, r2_score, roc_auc_score, \
+    read_csv, write_csv, is_data_frame, to_data_frame
 from .resources import get as rget, config as rconfig, output_dirs
 from .utils import Namespace, backup_file, cached, datetime_iso, json_load, memoize, profile
 
@@ -394,6 +396,10 @@ def do_score(m):
         for metric in metadata.metrics or []:
             scores[metric] = do_score(metric)
         scores.result = scores[scores.metric] if scores.metric in scores else do_score(scores.metric)
+        if not higher_is_better(scores.metric):
+            scores.metric = f"neg_{scores.metric}"
+            scores.result = - scores.result
+
         scores.info = result.info
         if scoring_errors:
             scores.info = "; ".join(filter(lambda it: it, [scores.info, *scoring_errors]))
@@ -453,6 +459,8 @@ def __init__(self, error):
 
 class ClassificationResult(Result):
 
+    multi_class_average = 'weighted'  # used by metrics like fbeta or auc
+
     def __init__(self, predictions_df, info=None):
         super().__init__(predictions_df, info)
         self.classes = self.df.columns[:-2].values.astype(str, copy=False)
@@ -464,42 +472,80 @@ def __init__(self, predictions_df, info=None):
         self.labels = self._autoencode(self.classes)
 
     def acc(self):
+        """Accuracy"""
         return float(accuracy_score(self.truth, self.predictions))
 
-    def balacc(self):
-        return float(balanced_accuracy_score(self.truth, self.predictions))
-
     def auc(self):
+        """Area Under (ROC) Curve, computed on probabilities, not on predictions"""
         if self.type != DatasetType.binary:
-            # raise ValueError("AUC metric is only supported for binary classification: {}.".format(self.classes))
-            log.warning("AUC metric is only supported for binary classification: %s.", self.labels)
+            log.warning("For multiclass problems, please use `auc_ovr` or `auc_ovo` metrics instead of `auc`.")
             return nan
-        return float(roc_auc_score(self.truth, self.probabilities[:, 1], labels=self.labels))
+        return float(roc_auc_score(self.truth, self.probabilities[:, 1]))
 
-    def cm(self):
-        return confusion_matrix(self.truth, self.predictions, labels=self.labels)
+    def auc_ovo(self):
+        """AUC One-vs-One"""
+        return self._auc_multi(mc='ovo')
 
-    def _per_class_errors(self):
-        return [(s-d)/s for s, d in ((sum(r), r[i]) for i, r in enumerate(self.cm()))]
+    def auc_ovr(self):
+        """AUC One-vs-Rest"""
+        return self._auc_multi(mc='ovr')
 
-    def mean_pce(self):
-        """mean per class error"""
-        return statistics.mean(self._per_class_errors())
+    def balacc(self):
+        """Balanced accuracy"""
+        return float(balanced_accuracy_score(self.truth, self.predictions))
 
-    def max_pce(self):
-        """max per class error"""
-        return max(self._per_class_errors())
+    def f05(self):
+        """F-beta 0.5"""
+        return self._fbeta(0.5)
 
     def f1(self):
-        return float(f1_score(self.truth, self.predictions, labels=self.labels))
+        """F-beta 1"""
+        return self._fbeta(1)
+
+    def f2(self):
+        """F-beta 2"""
+        return self._fbeta(2)
 
     def logloss(self):
+        """Log Loss"""
         return float(log_loss(self.truth, self.probabilities, labels=self.labels))
 
+    def max_pce(self):
+        """Max per Class Error"""
+        return max(self._per_class_errors())
+
+    def mean_pce(self):
+        """Mean per Class Error"""
+        return statistics.mean(self._per_class_errors())
+
+    def pr_auc(self):
+        """Precision Recall AUC"""
+        if self.type != DatasetType.binary:
+            log.warning("PR AUC metric is only available for binary problems.")
+            return nan
+        # precision, recall, thresholds = precision_recall_curve(self.truth, self.probabilities[:, 1])
+        # return float(auc(recall, precision))
+        return float(average_precision_score(self.truth, self.probabilities[:, 1]))
+
     def _autoencode(self, vec):
         needs_encoding = not _encode_predictions_and_truth_ or (isinstance(vec[0], str) and not vec[0].isdigit())
         return self.target.label_encoder.transform(vec) if needs_encoding else vec
 
+    def _auc_multi(self, mc='raise'):  # `mc` is forwarded as roc_auc_score's `multi_class` argument ('ovo' / 'ovr' from the public wrappers)
+        average = ClassificationResult.multi_class_average
+        return float(roc_auc_score(self.truth, self.probabilities, average=average, labels=self.labels, multi_class=mc))
+
+    def _cm(self):
+        return confusion_matrix(self.truth, self.predictions, labels=self.labels)
+
+    def _fbeta(self, beta):  # F-beta on predictions; class-level averaging for multiclass, plain 'binary' otherwise
+        average = ClassificationResult.multi_class_average if self.type == DatasetType.multiclass else 'binary'  # test the dataset type: `self.truth` is the label vector
+        return float(fbeta_score(self.truth, self.predictions, beta=beta, average=average, labels=self.labels))
+
+    def _per_class_errors(self):
+        return [(s-d)/s for s, d in ((sum(r), r[i]) for i, r in enumerate(self._cm()))]
+
+
 
 class RegressionResult(Result):
 
@@ -510,24 +556,34 @@ def __init__(self, predictions_df, info=None):
         self.type = DatasetType.regression
 
     def mae(self):
+        """Mean Absolute Error"""
         return float(mean_absolute_error(self.truth, self.predictions))
 
     def mse(self):
+        """Mean Squared Error"""
         return float(mean_squared_error(self.truth, self.predictions))
 
     def msle(self):
+        """Mean Squared Logarithmic Error"""
         return float(mean_squared_log_error(self.truth, self.predictions))
 
     def rmse(self):
+        """Root Mean Square Error"""
         return math.sqrt(self.mse())
 
     def rmsle(self):
+        """Root Mean Square Logarithmic Error"""
         return math.sqrt(self.msle())
 
     def r2(self):
+        """R^2"""
         return float(r2_score(self.truth, self.predictions))
 
 
+def higher_is_better(metric):  # truthy (an `re.Match`) when the whole metric name is an auc/pr_auc variant, `*acc`, `f<digits>` or `r2`; otherwise None
+    return re.fullmatch(r"((pr_)?auc(_\w*)?)|(\w*acc)|(f\d+)|(r2)", metric)
+
+
 _encode_predictions_and_truth_ = False
 
 save_predictions = TaskResult.save_predictions
diff --git a/amlb_report/results.py b/amlb_report/results.py
index 9f03293f7..2bb0ca6f8 100644
--- a/amlb_report/results.py
+++ b/amlb_report/results.py
@@ -1,11 +1,12 @@
 """
-Loading results, formatting and adding columns
-result is the raw result metric computed from predictions at the end the benchmark. For classification problems, it is usually auc for binomial classification and logloss for multinomial classification.
-score ensures a standard comparison between tasks: higher is always better.
-norm_score is a normalization of score on a [0, 1] scale, with {{zero_one_refs[0]}} score as 0 and {{zero_one_refs[1]}} score as 1.
-imp_result and imp_score for imputed results/scores. Given a task and a framework:
-if all folds results/scores are missing, then no imputation occurs, and the result is nan for each fold.
-if only some folds results/scores are missing, then the missing result is imputed by the {{imp_framework}} result for this fold.
+Loading results, formatting and adding columns.
+result is the raw result metric computed from predictions at the end of the benchmark: higher is always better!
+ - For classification problems, it is usually auc for binary problems and negative log loss for multiclass problems.
+ - For regression problems, it is usually negative rmse.
+norm_result is a normalization of result on a [0, 1] scale, with {{zero_one_refs[0]}} scoring as 0 and {{zero_one_refs[1]}} scoring as 1.
+imp_result for imputed results. Given a task and a framework:
+ - if all folds results are missing, then no imputation occurs, and the result is nan for each fold.
+ - if only some folds results are missing, then the missing result is imputed by the {{imp_framework}} result for this fold.
 """
 
 import numpy as np
@@ -52,35 +53,21 @@ def imputed(row):
     return pd.isna(row.result) and pd.notna(row.imp_result)
 
 
-fit_metrics = ['auc', 'acc', 'r2']
-
-
-def metric_type(row, res_col='result'):
-    return 'fit' if any([row[res_col] == getattr(row, m, None) for m in fit_metrics]) else 'loss'
-
-
-def score(row, res_col='result'):
-    return (row[res_col] if row['metric_type'] == 'fit'
-            else - row[res_col])
-
-
-def norm_score(row, score_col='score',
-               zero_one_refs=None, ref_results=None,
-               aggregation=None):
+def norm_result(row, res_col='result', zero_one_refs=None, ref_results=None, aggregation=None):
     if zero_one_refs is None:
-        return row[score_col]
+        return row[res_col]
 
     def get_val(ref, default):
         try:
             if isinstance(ref, str):
                 return (ref_results.loc[(ref_results.framework == ref)
                                         & (ref_results.task == row.task)]
-                                       [score_col]
+                                       [res_col]
                                    .agg(aggregation) if aggregation
                         else ref_results.loc[(ref_results.framework == ref)
                                              & (ref_results.task == row.task)
                                              & (ref_results.fold == row.fold)]
-                                            [score_col]
+                                            [res_col]
                                         .item())
             else:
                 return ref
@@ -89,9 +76,9 @@ def get_val(ref, default):
             # return default
 
     zero, one = (get_val(ref, i) for i, ref in enumerate(zero_one_refs))
-    rel_score = (row[score_col] - zero) / (one - zero)
-    return (- rel_score if row['metric_type'] == 'loss' and one < 0 <= zero
-            else rel_score)
+    norm_res = (row[res_col] - zero) / (one - zero)
+    return (- norm_res if row['metric'].startswith("neg_") and one < 0 <= zero
+            else norm_res)
 
 
 def sorted_ints(arr):
@@ -117,7 +104,8 @@ def prepare_results(results,
                     imputation=None,
                     normalization=None,
                     ref_results=None,
-                    duplicates_handling='fail'  # other options are 'keep_first', 'keep_last', 'keep_none'
+                    duplicates_handling='fail',  # other options are 'keep_first', 'keep_last', 'keep_none'
+                    include_metadata=False
                     ):
     if results is None or len(results) == 0:
         return None
@@ -139,7 +127,7 @@ def prepare_results(results,
 
     folds = results.fold.unique()
 
-    metadata = load_dataset_metadata(results)
+    metadata = load_dataset_metadata(results) if include_metadata else {}
 
     done = results.set_index(['task', 'fold', 'framework'])
     done = remove_duplicates(done, handling=duplicates_handling)
@@ -158,9 +146,8 @@ def prepare_results(results,
 
     # extending the data frame
     results = results.append(missing.reset_index())
-    results['type'] = [task_prop(row, metadata, 'type') for _, row in results.iterrows()]
-    results['metric_type'] = [metric_type(row) for _, row in results.iterrows()]
-    results['score'] = [score(row) for _, row in results.iterrows()]
+    if 'type' not in results:
+        results['type'] = [task_prop(row, metadata, 'type') for _, row in results.iterrows()]
 
     if ref_results is None:
         ref_results = results
@@ -177,18 +164,14 @@ def prepare_results(results,
                                                imp_framework=imp_fr, imp_results=ref_results,
                                                imp_value=imp_val, aggregation=aggr)
                                  for _, row in results.iterrows()]
-        results['imp_score'] = [impute_result(row, results, 'score',
-                                              imp_framework=imp_fr, imp_results=ref_results,
-                                              imp_value=imp_val, aggregation=aggr)
-                                for _, row in results.iterrows()]
 
     if normalization is not None:
-        score_col = 'imp_score' if imputation is not None else 'score'
+        res_col = 'imp_result' if imputation is not None else 'result'
         zero_one = normalization[0:2]
         aggr = normalization[2] if len(normalization) > 2 else None
-        results['norm_score'] = [norm_score(row, score_col,
-                                            zero_one_refs=zero_one, ref_results=ref_results, aggregation=aggr)
-                                 for _, row in results.iterrows()]
+        results['norm_result'] = [norm_result(row, res_col,
+                                              zero_one_refs=zero_one, ref_results=ref_results, aggregation=aggr)
+                                  for _, row in results.iterrows()]
 
     return Namespace(
         results=results,
diff --git a/reports/reports.ipynb b/reports/reports.ipynb
index 35908f77c..109022fc1 100644
--- a/reports/reports.ipynb
+++ b/reports/reports.ipynb
@@ -100,12 +100,11 @@
    "source": [
     "#### Loading results, formatting and adding columns\n",
     "- `result` is the raw result metric computed from predictions at the end the benchmark.\n",
-    "    For classification problems, it is usually `auc` for binomial classification and `logloss` for multinomial classification.\n",
-    "- `score` ensures a standard comparison between tasks: **higher is always better**.\n",
-    "- `norm_score` is a normalization of `score` on a `[0, 1]` scale, with `{{normalization[0]}}` score as `0` and `{{normalization[1]}}` score as `1`.\n",
-    "- `imp_result` and `imp_score` for imputed results/scores. Given a task and a framework:\n",
-    "    - if **all folds results/scores are missing**, then no imputation occurs, and the result is `nan` for each fold.\n",
-    "    - if **only some folds results/scores are missing**, then the missing result can be imputed by setting `{{imputation='framework'}}` and use that framework to impute the result for this fold."
+    "    For classification problems, it is usually `auc` for binomial classification and `neg_logloss` for multinomial classification (higher is always better).\n",
+    "- `norm_result` is a normalization of `result` on a `[0, 1]` scale, with `{{normalization[0]}}` result as `0` and `{{normalization[1]}}` result as `1`.\n",
+    "- `imp_result` for imputed results. Given a task and a framework:\n",
+    "    - if **all folds results are missing**, then no imputation occurs, and the result is `nan` for each fold.\n",
+    "    - if **only some folds results are missing**, then the missing result can be imputed by setting `{{imputation='framework'}}` and use that framework to impute the result for this fold."
    ]
   },
   {
@@ -144,11 +143,8 @@
     "# row_filter = lamdba r: r.fold == 0     #! r is a pd.Series\n",
     "title_extra = \"\"\n",
     "binary_result_label = 'AUC'\n",
-    "binary_score_label = 'AUC'\n",
-    "multiclass_result_label = 'logloss'\n",
-    "multiclass_score_label = 'neg. logloss'\n",
-    "regression_result_label = 'RMSE'\n",
-    "regression_score_label = 'neg. RMSE'\n",
+    "multiclass_result_label = 'neg. Log loss'\n",
+    "regression_result_label = 'neg. RMSE'\n",
     "\n",
     "# register_colormap(config.colormap, ('colorblind', [1, 0, 2, 3, 4, 5]))"
    ]
@@ -255,6 +251,7 @@
     "                                     exclusions=excluded_frameworks,\n",
     "                                     normalization=normalization,\n",
     "                                     duplicates_handling=duplicates_handling,\n",
+    "                                     include_metadata=True\n",
     "                                     ) \n",
     "               for name, run in runs.items() if runs[name].get('ref', False)}"
    ]
@@ -474,11 +471,11 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Averaging using arithmetic mean over fold `result` or `score`.\n",
-    "In following summaries, if not mentioned otherwise, and if scores imputation was enabled, the means are computed over imputed results/scores .\n",
+    "Averaging using arithmetic mean over fold `result`.\n",
+    "In the following summaries, if not mentioned otherwise, and if results imputation was enabled, the means are computed over imputed results.\n",
     "Given a task and a framework:\n",
-    "- if **all folds results/scores are missing**, then no imputation occured, and the mean result is `nan`.\n",
-    "- if **only some folds results/scores are missing**, then the amount of imputed results that contributed to the mean are displayed between parenthesis."
+    "- if **all folds results are missing**, then no imputation occurred, and the mean result is `nan`.\n",
+    "- if **only some folds results are missing**, then the amount of imputed results that contributed to the mean are displayed between parenthesis."
    ]
   },
   {
@@ -522,24 +519,6 @@
     "res_summary.to_csv(create_file(output_dir, \"tables\", \"results_summary.csv\"))"
    ]
   },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Score mean"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "score_summary = render_summary('score', \n",
-    "                               results=all_res)\n",
-    "score_summary.to_csv(create_file(output_dir, \"tables\", \"score_summary.csv\"))"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -547,9 +526,9 @@
    "outputs": [],
    "source": [
     "if normalization:\n",
-    "    norm_score_summary = render_summary('norm_score', \n",
-    "                                        results=all_res)\n",
-    "    norm_score_summary.to_csv(create_file(output_dir, \"tables\", \"normalized_score_summary.csv\"))"
+    "    norm_result_summary = render_summary('norm_result', \n",
+    "                                         results=all_res)\n",
+    "    norm_result_summary.to_csv(create_file(output_dir, \"tables\", \"normalized_result_summary.csv\"))"
    ]
   },
   {
@@ -565,7 +544,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "benchmark_leaderboard = render_leaderboard('score', \n",
+    "benchmark_leaderboard = render_leaderboard('result', \n",
     "                                           results=all_res,\n",
     "                                           aggregate=True)\n",
     "benchmark_leaderboard.to_csv(create_file(output_dir, \"tables\", \"benchmark_leaderboard.csv\"))"
@@ -575,7 +554,7 @@
    "cell_type": "raw",
    "metadata": {},
    "source": [
-    "folds_leaderboard = render_leaderboard('score', \n",
+    "folds_leaderboard = render_leaderboard('result', \n",
     "                                        results=all_res,\n",
     "                                        aggregate=False)\n",
     "folds_leaderboard.to_csv(create_file(output_dir, \"tables\", \"folds_leaderboard.csv\"))"
@@ -611,17 +590,17 @@
    "outputs": [],
    "source": [
     "if 'binary' in problem_types:\n",
-    "    fig = draw_score_heatmap('score',\n",
+    "    fig = draw_score_heatmap('result',\n",
     "                             results=all_res,\n",
     "                             type_filter='binary', \n",
     "                             metadata=metadata,\n",
     "                             x_labels=frameworks_labels or True,\n",
     "                             x_sort_by=frameworks_sort_key,\n",
     "                             y_sort_by='nrows',\n",
-    "                             title=f\"Scores ({binary_score_label}) on {results_group} binary classification problems{title_extra}\",\n",
+    "                             title=f\"Results ({binary_result_label}) on {results_group} binary classification problems{title_extra}\",\n",
     "                             center=0.5\n",
     "                            );\n",
-    "    savefig(fig, create_file(output_dir, \"visualizations\", \"binary_score_heat.png\"))"
+    "    savefig(fig, create_file(output_dir, \"visualizations\", \"binary_result_heat.png\"))"
    ]
   },
   {
@@ -631,17 +610,17 @@
    "outputs": [],
    "source": [
     "if 'multiclass' in problem_types:\n",
-    "    fig = draw_score_heatmap('score', \n",
+    "    fig = draw_score_heatmap('result', \n",
     "                             results=all_res,\n",
     "                             type_filter='multiclass', \n",
     "                             metadata=metadata,\n",
     "                             x_labels=frameworks_labels  or True,\n",
     "                             x_sort_by=frameworks_sort_key,\n",
     "                             y_sort_by='nrows',\n",
-    "                             title=f\"Scores ({multiclass_score_label}) on {results_group} multi-class classification problems{title_extra}\",\n",
+    "                             title=f\"Results ({multiclass_result_label}) on {results_group} multi-class classification problems{title_extra}\",\n",
     "                             center=0\n",
     "                            );\n",
-    "    savefig(fig, create_file(output_dir, \"visualizations\", \"multiclass_score_heat.png\"))"
+    "    savefig(fig, create_file(output_dir, \"visualizations\", \"multiclass_result_heat.png\"))"
    ]
   },
   {
@@ -651,17 +630,17 @@
    "outputs": [],
    "source": [
     "if 'regression' in problem_types:\n",
-    "    fig = draw_score_heatmap('score', \n",
+    "    fig = draw_score_heatmap('result', \n",
     "                             results=all_res,\n",
     "                             type_filter='regression', \n",
     "                             metadata=metadata,\n",
     "                             x_labels=frameworks_labels  or True,\n",
     "                             x_sort_by=frameworks_sort_key,\n",
     "                             y_sort_by='nrows',\n",
-    "                             title=f\"Scores ({regression_score_label}) on {results_group} regression problems{title_extra}\",\n",
+    "                             title=f\"Results ({regression_result_label}) on {results_group} regression problems{title_extra}\",\n",
     "                             center=0\n",
     "                            );\n",
-    "    savefig(fig, create_file(output_dir, \"visualizations\", \"regression_score_heat.png\"))"
+    "    savefig(fig, create_file(output_dir, \"visualizations\", \"regression_result_heat.png\"))"
    ]
   },
   {
@@ -678,20 +657,20 @@
    "outputs": [],
    "source": [
     "if 'binary' in problem_types:\n",
-    "    fig = draw_score_barplot('score',\n",
+    "    fig = draw_score_barplot('result',\n",
     "                             results=all_res,\n",
     "                             type_filter='binary', \n",
     "                             metadata=metadata,\n",
     "                             x_sort_by=tasks_sort_by,\n",
-    "                             ylabel=binary_score_label,\n",
+    "                             ylabel=binary_result_label,\n",
     "                             ylim=dict(bottom=.5),\n",
     "                             hue_sort_by=frameworks_sort_key, \n",
     "                             ci=95,\n",
-    "                             title=f\"Scores ({binary_score_label}) on {results_group} binary classification problems{title_extra}\",\n",
+    "                             title=f\"Results ({binary_result_label}) on {results_group} binary classification problems{title_extra}\",\n",
     "                             legend_loc='lower center',\n",
     "                             legend_labels=frameworks_labels,\n",
     "                            );\n",
-    "    savefig(fig, create_file(output_dir, \"visualizations\", \"binary_score_barplot.png\"))"
+    "    savefig(fig, create_file(output_dir, \"visualizations\", \"binary_result_barplot.png\"))"
    ]
   },
   {
@@ -701,20 +680,20 @@
    "outputs": [],
    "source": [
     "if 'multiclass' in problem_types:\n",
-    "    fig = draw_score_barplot('score',\n",
+    "    fig = draw_score_barplot('result',\n",
     "                             results=all_res,\n",
     "                             type_filter='multiclass', \n",
     "                             metadata=metadata,\n",
     "                             x_sort_by=tasks_sort_by,\n",
-    "                             ylabel=multiclass_score_label,\n",
+    "                             ylabel=multiclass_result_label,\n",
     "                             ylim=dict(top=0.1),\n",
     "                             hue_sort_by=frameworks_sort_key,\n",
     "                             ci=95,\n",
-    "                             title=f\"Scores ({multiclass_score_label}) on {results_group} multiclass classification problems{title_extra}\",\n",
+    "                             title=f\"Results ({multiclass_result_label}) on {results_group} multiclass classification problems{title_extra}\",\n",
     "                             legend_loc='lower center',\n",
     "                             legend_labels=frameworks_labels,\n",
     "                            );\n",
-    "    savefig(fig, create_file(output_dir, \"visualizations\", \"multiclass_score_barplot.png\"))"
+    "    savefig(fig, create_file(output_dir, \"visualizations\", \"multiclass_result_barplot.png\"))"
    ]
   },
   {
@@ -724,22 +703,22 @@
    "outputs": [],
    "source": [
     "if 'regression' in problem_types:\n",
-    "    fig = draw_score_barplot('score',\n",
+    "    fig = draw_score_barplot('result',\n",
     "                             results=all_res,\n",
     "                             type_filter='regression', \n",
     "                             metadata=metadata,\n",
     "                             x_sort_by=tasks_sort_by,\n",
     "                             yscale='symlog',\n",
-    "                             ylabel=regression_score_label,\n",
+    "                             ylabel=regression_result_label,\n",
     "                             ylim=dict(top=0.1),\n",
     "                             hue_sort_by=frameworks_sort_key, \n",
     "                             ci=95,\n",
-    "                             title=f\"Scores ({regression_score_label}) on {results_group} regression classification problems{title_extra}\",\n",
+    "                             title=f\"Results ({regression_result_label}) on {results_group} regression problems{title_extra}\",\n",
     "                             legend_loc='lower center',\n",
     "                             legend_labels=frameworks_labels,\n",
     "                             size=(8, 6),\n",
     "                            );\n",
-    "    savefig(fig, create_file(output_dir, \"visualizations\", \"regression_score_barplot.png\"))"
+    "    savefig(fig, create_file(output_dir, \"visualizations\", \"regression_result_barplot.png\"))"
    ]
   },
   {
@@ -756,20 +735,20 @@
    "outputs": [],
    "source": [
     "if 'binary' in problem_types:\n",
-    "    fig = draw_score_pointplot('score',\n",
+    "    fig = draw_score_pointplot('result',\n",
     "                               results=all_res,\n",
     "                               type_filter='binary', \n",
     "                               metadata=metadata,\n",
     "                               x_sort_by=tasks_sort_by,\n",
-    "                               ylabel=binary_score_label,\n",
+    "                               ylabel=binary_result_label,\n",
     "                               ylim=dict(bottom=.5),\n",
     "                               hue_sort_by=frameworks_sort_key,\n",
     "                               join='none', marker='hline_xspaced', ci=95, \n",
-    "                               title=f\"Scores ({binary_score_label}) on {results_group} binary classification problems{title_extra}\",\n",
+    "                               title=f\"Results ({binary_result_label}) on {results_group} binary classification problems{title_extra}\",\n",
     "                               legend_loc='lower center',\n",
     "                               legend_labels=frameworks_labels,\n",
     "                              );\n",
-    "    savefig(fig, create_file(output_dir, \"visualizations\", \"binary_score_pointplot.png\"))"
+    "    savefig(fig, create_file(output_dir, \"visualizations\", \"binary_result_pointplot.png\"))"
    ]
   },
   {
@@ -779,19 +758,19 @@
    "outputs": [],
    "source": [
     "if 'multiclass' in problem_types:\n",
-    "    fig = draw_score_pointplot('score',\n",
+    "    fig = draw_score_pointplot('result',\n",
     "                               results=all_res,\n",
     "                               type_filter='multiclass', \n",
     "                               metadata=metadata,\n",
     "                               x_sort_by=tasks_sort_by,\n",
-    "                               ylabel=multiclass_score_label,\n",
+    "                               ylabel=multiclass_result_label,\n",
     "                               hue_sort_by=frameworks_sort_key,\n",
     "                               join='none', marker='hline_xspaced', ci=95, \n",
-    "                               title=f\"Scores ({multiclass_score_label}) on {results_group} multiclass classification problems{title_extra}\",\n",
+    "                               title=f\"Results ({multiclass_result_label}) on {results_group} multiclass classification problems{title_extra}\",\n",
     "                               legend_loc='lower center',\n",
     "                               legend_labels=frameworks_labels,\n",
     "                              );\n",
-    "    savefig(fig, create_file(output_dir, \"visualizations\", \"multiclass_score_pointplot.png\"))"
+    "    savefig(fig, create_file(output_dir, \"visualizations\", \"multiclass_result_pointplot.png\"))"
    ]
   },
   {
@@ -801,22 +780,22 @@
    "outputs": [],
    "source": [
     "if 'regression' in problem_types:\n",
-    "    fig = draw_score_pointplot('score',\n",
+    "    fig = draw_score_pointplot('result',\n",
     "                               results=all_res,\n",
     "                               type_filter='regression', \n",
     "                               metadata=metadata,\n",
     "                               x_sort_by=tasks_sort_by,\n",
-    "                               ylabel=regression_score_label,\n",
+    "                               ylabel=regression_result_label,\n",
     "                               yscale='symlog',\n",
     "                               ylim=dict(top=0.1),\n",
     "                               hue_sort_by=frameworks_sort_key,\n",
     "                               join='none', marker='hline_xspaced', ci=95, \n",
-    "                               title=f\"Scores ({regression_score_label}) on {results_group} regression classification problems{title_extra}\",\n",
+    "                               title=f\"Results ({regression_result_label}) on {results_group} regression problems{title_extra}\",\n",
     "                               legend_loc='lower center',\n",
     "                               legend_labels=frameworks_labels,\n",
     "                               size=(8, 6),\n",
     "                              );\n",
-    "    savefig(fig, create_file(output_dir, \"visualizations\", \"regression_score_pointplot.png\"))"
+    "    savefig(fig, create_file(output_dir, \"visualizations\", \"regression_result_pointplot.png\"))"
    ]
   },
   {
@@ -833,17 +812,17 @@
    "outputs": [],
    "source": [
     "if 'binary' in problem_types:\n",
-    "    fig = draw_score_stripplot('score', \n",
+    "    fig = draw_score_stripplot('result', \n",
     "                               results=all_res.sort_values(by=['framework']),\n",
     "                               type_filter='binary', \n",
     "                               metadata=metadata,\n",
-    "                               xlabel=binary_score_label,\n",
+    "                               xlabel=binary_result_label,\n",
     "                               y_sort_by=tasks_sort_by,\n",
     "                               hue_sort_by=frameworks_sort_key,\n",
-    "                               title=f\"Scores ({binary_score_label}) on {results_group} binary classification problems{title_extra}\",\n",
+    "                               title=f\"Results ({binary_result_label}) on {results_group} binary classification problems{title_extra}\",\n",
     "                               legend_labels=frameworks_labels,\n",
     "                              );\n",
-    "    savefig(fig, create_file(output_dir, \"visualizations\", \"binary_score_stripplot.png\"))"
+    "    savefig(fig, create_file(output_dir, \"visualizations\", \"binary_result_stripplot.png\"))"
    ]
   },
   {
@@ -853,18 +832,18 @@
    "outputs": [],
    "source": [
     "if 'multiclass' in problem_types:\n",
-    "    fig = draw_score_stripplot('score', \n",
+    "    fig = draw_score_stripplot('result', \n",
     "                               results=all_res.sort_values(by=['framework']),\n",
     "                               type_filter='multiclass', \n",
     "                               metadata=metadata,\n",
-    "                               xlabel=multiclass_score_label,\n",
+    "                               xlabel=multiclass_result_label,\n",
     "                               xscale='symlog',\n",
     "                               y_sort_by=tasks_sort_by,\n",
     "                               hue_sort_by=frameworks_sort_key,\n",
-    "                               title=f\"Scores ({multiclass_score_label}) on {results_group} multi-class classification problems{title_extra}\",\n",
+    "                               title=f\"Results ({multiclass_result_label}) on {results_group} multi-class classification problems{title_extra}\",\n",
     "                               legend_labels=frameworks_labels,\n",
     "                              );\n",
-    "    savefig(fig, create_file(output_dir, \"visualizations\", \"multiclass_score_stripplot.png\"))"
+    "    savefig(fig, create_file(output_dir, \"visualizations\", \"multiclass_result_stripplot.png\"))"
    ]
   },
   {
@@ -874,18 +853,18 @@
    "outputs": [],
    "source": [
     "if 'regression' in problem_types:\n",
-    "    fig = draw_score_stripplot('score', \n",
+    "    fig = draw_score_stripplot('result', \n",
     "                               results=all_res,\n",
     "                               type_filter='regression', \n",
     "                               metadata=metadata,\n",
-    "                               xlabel=regression_score_label,\n",
+    "                               xlabel=regression_result_label,\n",
     "                               xscale='symlog',\n",
     "                               y_sort_by=tasks_sort_by,\n",
     "                               hue_sort_by=frameworks_sort_key,\n",
-    "                               title=f\"Scores ({regression_score_label}) on {results_group} regression problems{title_extra}\",\n",
+    "                               title=f\"Results ({regression_result_label}) on {results_group} regression problems{title_extra}\",\n",
     "                               legend_labels=frameworks_labels,\n",
     "                              );\n",
-    "    savefig(fig, create_file(output_dir, \"visualizations\", \"regression_score_stripplot.png\"))"
+    "    savefig(fig, create_file(output_dir, \"visualizations\", \"regression_result_stripplot.png\"))"
    ]
   },
   {
@@ -902,17 +881,17 @@
    "outputs": [],
    "source": [
     "if 'binary' in problem_types and normalization:\n",
-    "    fig = draw_score_stripplot('norm_score', \n",
+    "    fig = draw_score_stripplot('norm_result', \n",
     "                               results=all_res,\n",
     "                               type_filter='binary', \n",
     "                               metadata=metadata,\n",
-    "                               xlabel=f\"rel. {binary_score_label}\",\n",
+    "                               xlabel=f\"rel. {binary_result_label}\",\n",
     "                               y_sort_by='nrows',\n",
     "                               hue_sort_by=frameworks_sort_key,\n",
-    "                               title=f\"Relative scores ({binary_score_label}) on {results_group} binary classification problems{title_extra}\",\n",
+    "                               title=f\"Relative results ({binary_result_label}) on {results_group} binary classification problems{title_extra}\",\n",
     "                               legend_labels=frameworks_labels,\n",
     "                              );\n",
-    "    savefig(fig, create_file(output_dir, \"visualizations\", \"binary_rel_score_stripplot.png\"))"
+    "    savefig(fig, create_file(output_dir, \"visualizations\", \"binary_rel_result_stripplot.png\"))"
    ]
   },
   {
@@ -922,18 +901,18 @@
    "outputs": [],
    "source": [
     "if 'multiclass' in problem_types and normalization:\n",
-    "    fig = draw_score_stripplot('norm_score', \n",
+    "    fig = draw_score_stripplot('norm_result', \n",
     "                               results=all_res,\n",
     "                               type_filter='multiclass', \n",
     "                               metadata=metadata,\n",
-    "                               xlabel=f\"rel. {multiclass_score_label}\",\n",
+    "                               xlabel=f\"rel. {multiclass_result_label}\",\n",
     "                               xscale='symlog',\n",
     "                               y_sort_by='nrows',\n",
     "                               hue_sort_by=frameworks_sort_key,\n",
-    "                               title=f\"Relative scores ({multiclass_score_label}) on {results_group} multi-class classification problems{title_extra}\",\n",
+    "                               title=f\"Relative results ({multiclass_result_label}) on {results_group} multi-class classification problems{title_extra}\",\n",
     "                               legend_labels=frameworks_labels,\n",
     "                              );\n",
-    "    savefig(fig, create_file(output_dir, \"visualizations\", \"multiclass_rel_score_stripplot.png\"))"
+    "    savefig(fig, create_file(output_dir, \"visualizations\", \"multiclass_rel_result_stripplot.png\"))"
    ]
   },
   {
@@ -943,17 +922,17 @@
    "outputs": [],
    "source": [
     "if 'regression' in problem_types and normalization:\n",
-    "    fig = draw_score_stripplot('norm_score', \n",
+    "    fig = draw_score_stripplot('norm_result', \n",
     "                               results=all_res,\n",
     "                               type_filter='regression', \n",
     "                               metadata=metadata,\n",
-    "                               xlabel=f\"rel. {regression_score_label}\",\n",
+    "                               xlabel=f\"rel. {regression_result_label}\",\n",
     "                               y_sort_by='nrows',\n",
     "                               hue_sort_by=frameworks_sort_key,\n",
-    "                               title=f\"Relative scores ({regression_score_label}) on {results_group} regression problems{title_extra}\",\n",
+    "                               title=f\"Relative results ({regression_result_label}) on {results_group} regression problems{title_extra}\",\n",
     "                               legend_labels=frameworks_labels,\n",
     "                              );\n",
-    "    savefig(fig, create_file(output_dir, \"visualizations\", \"regression_rel_score_stripplot.png\"))"
+    "    savefig(fig, create_file(output_dir, \"visualizations\", \"regression_rel_result_stripplot.png\"))"
    ]
   },
   {
diff --git a/resources/config.yaml b/resources/config.yaml
index e217613a7..a1dfbf1ec 100644
--- a/resources/config.yaml
+++ b/resources/config.yaml
@@ -50,10 +50,12 @@ benchmarks:
   os_mem_size_mb: 2048        # the default amount of memory left to the OS when task assigned memory is computed automatically.
   os_vol_size_mb: 2048        # the default amount of volume left to the OS when task volume memory is verified.
   overhead_time_seconds: 3600   # amount of additional time allowed for the job to complete before sending an interruption signal
-  metrics:                    # default metrics by dataset type (as listed by amlb.data.DatasetType), only the first metric is optimized by the frameworks, the others are computed only for information purpose.
-    binary: ['auc', 'logloss', 'acc', 'balacc']
-    multiclass: ['logloss', 'acc', 'balacc']
-    regression: ['rmse', 'r2', 'mae']
+  metrics:                    # default metrics by dataset type (as listed by amlb.data.DatasetType),
+                              # only the first metric is optimized by the frameworks,
+                              # the others are computed for information purposes only.
+    binary: ['auc', 'logloss', 'acc', 'balacc']     # available metrics: auc (AUC), acc (Accuracy), balacc (Balanced Accuracy), pr_auc (Precision Recall AUC), logloss (Log Loss), f1, f2, f05 (F-beta scores with beta=1, 2, or 0.5), max_pce, mean_pce (Max/Mean Per-Class Error).
+    multiclass: ['logloss', 'acc', 'balacc']        # available metrics: same as for binary, except that auc is replaced by auc_ovo (AUC One-vs-One) and auc_ovr (AUC One-vs-Rest). AUC and F-beta metrics are computed with a weighted average.
+    regression: ['rmse', 'r2', 'mae']               # available metrics: mae (Mean Absolute Error), mse (Mean Squared Error), msle (Mean Squared Logarithmic Error), rmse (Root Mean Squared Error), rmsle (Root Mean Squared Logarithmic Error), r2 (R^2).
   defaults:
     folds: 10
     max_runtime_seconds: 3600