Skip to content

Commit befe416

Browse files
authored
Add AUROC and AUPRC metrics for binary classification tasks (#244)
* add auroc metric * add auprc metric * update metric_for_best_model help message
1 parent 6750e05 commit befe416

File tree

4 files changed

+22
-16
lines changed

4 files changed

+22
-16
lines changed

src/cnlpt/_cli/train.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -336,7 +336,6 @@ def transformers_arg_option(field_name: str, *args, **kwargs):
336336
FinalTaskWeightArg = Annotated[float, training_arg_option("final_task_weight")]
337337
FreezeEncoderArg = Annotated[float, training_arg_option("freeze_encoder")]
338338
BiasFitArg = Annotated[bool, training_arg_option("bias_fit")]
339-
ReportProbsArg = Annotated[bool, training_arg_option("report_probs")]
340339
EvalsPerEpochArg = Annotated[int, training_arg_option("evals_per_epoch")]
341340
RichDisplayArg = Annotated[bool, training_arg_option("rich_display")]
342341
LoggingStrategyArg = Annotated[
@@ -415,7 +414,6 @@ def train(
415414
final_task_weight: FinalTaskWeightArg = 1.0,
416415
freeze_encoder: FreezeEncoderArg = 0.0,
417416
bias_fit: BiasFitArg = False,
418-
report_probs: ReportProbsArg = False,
419417
evals_per_epoch: EvalsPerEpochArg = 0,
420418
rich_display: RichDisplayArg = True,
421419
logging_strategy: LoggingStrategyArg = IntervalStrategy.EPOCH,
@@ -537,7 +535,6 @@ def train(
537535
final_task_weight=final_task_weight,
538536
freeze_encoder=freeze_encoder,
539537
bias_fit=bias_fit,
540-
report_probs=report_probs,
541538
evals_per_epoch=evals_per_epoch,
542539
rich_display=rich_display,
543540
logging_strategy=logging_strategy,

src/cnlpt/train_system/args.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -47,12 +47,6 @@ def __post_init__(self):
4747
"help": "Only optimize the bias parameters of the encoder (and the weights of the classifier heads), as proposed in the BitFit paper by Ben Zaken et al. 2021 (https://arxiv.org/abs/2106.10199)."
4848
},
4949
)
50-
report_probs: bool = field(
51-
default=False,
52-
metadata={
53-
"help": "If selected, probability scores will be added to the output prediction file for test data when used with --do_predict."
54-
},
55-
)
5650
evals_per_epoch: int = field(
5751
default=0,
5852
metadata={
@@ -85,6 +79,6 @@ def __post_init__(self):
8579
metric_for_best_model: Union[str, None] = field(
8680
default="avg_macro_f1",
8781
metadata={
88-
"help": 'The metric to use to compare two different models. Average across tasks with "avg_[acc|macro_f1|micro_f1]". Optimize for a specific task with "taskname.[acc|macro_f1|micro_f1]". Optimize for a particular label with "taskname.labelname.f1". Average multiple metrics with "METRIC_1,METRIC_2".'
82+
"help": 'The metric to use to compare two different models. Average across tasks with "avg_[acc|macro_f1|micro_f1]". Optimize for a specific task with "taskname.[acc|macro_f1|micro_f1]". Optimize for a particular label with "taskname.labelname.f1". For binary classification tasks, optimize for AUROC with "taskname.auroc" or for AUPRC with "taskname.labelname.auprc". Average multiple metrics with "METRIC_1,METRIC_2".'
8983
},
9084
)

src/cnlpt/train_system/cnlp_train_system.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -85,11 +85,7 @@ def _extract_task_predictions(self, p: EvalPrediction):
8585
preds = np.argmax(raw_preds, axis=3)
8686
else:
8787
preds = np.argmax(raw_preds, axis=1)
88-
if self.args.report_probs:
89-
probs = np.max(
90-
[simple_softmax(logits) for logits in raw_preds],
91-
axis=1,
92-
)
88+
probs = np.array([simple_softmax(logits) for logits in raw_preds])
9389

9490
labels: Union[npt.NDArray[np.int64], None]
9591
task_label_width = 0

src/cnlpt/train_system/metrics.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,13 @@
22
from typing import Union
33

44
import numpy as np
5-
from sklearn.metrics import classification_report
5+
from sklearn.metrics import (
6+
average_precision_score,
7+
classification_report,
8+
roc_auc_score,
9+
)
10+
11+
from cnlpt.data import CLASSIFICATION
612

713
from ..data.preprocess import MASK_VALUE
814
from ..data.task_info import TaskInfo
@@ -48,4 +54,17 @@ def compute_metrics(self) -> dict[str, float]:
4854
**{f"{label}.f1": report[label]["f1-score"] for label in self.task.labels},
4955
}
5056

57+
if (
58+
self.task.type == CLASSIFICATION
59+
and len(self.task.labels) == 2
60+
and self.probs is not None
61+
):
62+
task_metrics["auroc"] = roc_auc_score(labels, self.probs[pred_inds[0], 1])
63+
for label in self.task.labels:
64+
task_metrics[f"{label}.auprc"] = average_precision_score(
65+
labels,
66+
self.probs[pred_inds[0], 1],
67+
pos_label=self.task.get_label_id(label),
68+
)
69+
5170
return {f"{self.task.name}.{key}": val for key, val in task_metrics.items()}

0 commit comments

Comments (0)