Skip to content

Commit 08ad93b

Browse files
authored
Add implementation of precision-recall gain AUC (#21370)
* Remove dead code in the confusion metrics * Add PRGAIN enum for AUC metric types * Add implementation of precision-recall gain AUC Based on https://research-information.bris.ac.uk/files/72164009/5867_precision_recall_gain_curves_pr_analysis_done_right.pdf
1 parent 0c0ec1a commit 08ad93b

File tree

3 files changed

+118
-21
lines changed

3 files changed

+118
-21
lines changed

keras/src/metrics/confusion_metrics.py

Lines changed: 41 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1346,25 +1346,6 @@ def update_state(self, y_true, y_pred, sample_weight=None):
13461346
if not self._built:
13471347
self._build(y_pred.shape)
13481348

1349-
if self.multi_label or (self.label_weights is not None):
1350-
# y_true should have shape (number of examples, number of labels).
1351-
shapes = [(y_true, ("N", "L"))]
1352-
if self.multi_label:
1353-
# TP, TN, FP, and FN should all have shape
1354-
# (number of thresholds, number of labels).
1355-
shapes.extend(
1356-
[
1357-
(self.true_positives, ("T", "L")),
1358-
(self.true_negatives, ("T", "L")),
1359-
(self.false_positives, ("T", "L")),
1360-
(self.false_negatives, ("T", "L")),
1361-
]
1362-
)
1363-
if self.label_weights is not None:
1364-
# label_weights should be of length equal to the number of
1365-
# labels.
1366-
shapes.append((self.label_weights, ("L",)))
1367-
13681349
# Only forward label_weights to update_confusion_matrix_variables when
13691350
# multi_label is False. Otherwise the averaging of individual label AUCs
13701351
# is handled in AUC.result
@@ -1500,13 +1481,53 @@ def result(self):
15001481
)
15011482
x = fp_rate
15021483
y = recall
1503-
else: # curve == 'PR'.
1484+
elif self.curve == metrics_utils.AUCCurve.PR: # curve == 'PR'.
15041485
precision = ops.divide_no_nan(
15051486
self.true_positives,
15061487
ops.add(self.true_positives, self.false_positives),
15071488
)
15081489
x = recall
15091490
y = precision
1491+
else: # curve == 'PRGAIN'.
1492+
# Due to the hyperbolic transform, this formula is less robust than
1493+
# ROC and PR values. In particular
1494+
# 1) Both measures diverge when there are no negative values;
1495+
# 2) Both measures diverge when there are no true positives;
1496+
# 3) Recall gain becomes negative when the recall is lower than the
1497+
# label average (i.e. when more negative examples are
1498+
# classified positive than real positives).
1499+
#
1500+
# We ignore case 1 as it is easily understood that metrics would be
1501+
# badly defined then. For case 2 we set recall_gain to 0 and
1502+
# precision_gain to 1. For case 3 we set recall_gain to 0. These
1503+
# fixes will result in an overestimation of the AUC for estimators
1504+
# that are anti-correlated with the label (at some threshold).
1505+
1506+
# The scaling factor $\frac{P}{N}$ that is used for both gain
1507+
# values.
1508+
scaling_factor = ops.divide_no_nan(
1509+
ops.add(self.true_positives, self.false_negatives),
1510+
ops.add(self.true_negatives, self.false_positives),
1511+
)
1512+
1513+
recall_gain = 1.0 - scaling_factor * ops.divide_no_nan(
1514+
self.false_negatives, self.true_positives
1515+
)
1516+
precision_gain = 1.0 - scaling_factor * ops.divide_no_nan(
1517+
self.false_positives, self.true_positives
1518+
)
1519+
# Handle case 2.
1520+
recall_gain = ops.where(
1521+
ops.equal(self.true_positives, 0.0), 0.0, recall_gain
1522+
)
1523+
precision_gain = ops.where(
1524+
ops.equal(self.true_positives, 0.0), 1.0, precision_gain
1525+
)
1526+
# Handle case 3.
1527+
recall_gain = ops.maximum(recall_gain, 0.0)
1528+
1529+
x = recall_gain
1530+
y = precision_gain
15101531

15111532
# Find the rectangle heights based on `summation_method`.
15121533
if (

keras/src/metrics/confusion_metrics_test.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1396,6 +1396,79 @@ def test_weighted_pr_interpolation_negative_weights(self):
13961396
# produce all zeros.
13971397
self.assertAllClose(result, 0.0, 1e-3)
13981398

1399+
def test_weighted_prgain_majoring(self):
1400+
auc_obj = metrics.AUC(
1401+
num_thresholds=self.num_thresholds,
1402+
curve="PRGAIN",
1403+
summation_method="majoring",
1404+
)
1405+
result = auc_obj(
1406+
self.y_true, self.y_pred, sample_weight=self.sample_weight
1407+
)
1408+
1409+
# tp = [7, 4, 0], fp = [3, 0, 0], fn = [0, 3, 7], tn = [0, 3, 3]
1410+
# scaling_factor (P/N) = 7/3
1411+
# recall_gain = 1 - 7/3 [0/7, 3/4, 7/0] = [1, -3/4, -inf] -> [1, 0, 0]
1412+
# precision_gain = 1 - 7/3 [3/7, 0/4, 0/0] = [0, 1, NaN] -> [0, 1, 1]
1413+
# heights = [max(0, 1), max(1, 1)] = [1, 1]
1414+
# widths = [(1 - 0), (0 - 0)] = [1, 0]
1415+
expected_result = 1 * 1 + 0 * 1
1416+
self.assertAllClose(result, expected_result, 1e-3)
1417+
1418+
def test_weighted_prgain_minoring(self):
1419+
auc_obj = metrics.AUC(
1420+
num_thresholds=self.num_thresholds,
1421+
curve="PRGAIN",
1422+
summation_method="minoring",
1423+
)
1424+
result = auc_obj(
1425+
self.y_true, self.y_pred, sample_weight=self.sample_weight
1426+
)
1427+
1428+
# tp = [7, 4, 0], fp = [3, 0, 0], fn = [0, 3, 7], tn = [0, 3, 3]
1429+
# scaling_factor (P/N) = 7/3
1430+
# recall_gain = 1 - 7/3 [0/7, 3/4, 7/0] = [1, -3/4, -inf] -> [1, 0, 0]
1431+
# precision_gain = 1 - 7/3 [3/7, 0/4, 0/0] = [0, 1, NaN] -> [0, 1, 1]
1432+
# heights = [min(0, 1), min(1, 1)] = [0, 1]
1433+
# widths = [(1 - 0), (0 - 0)] = [1, 0]
1434+
expected_result = 1 * 0 + 0 * 1
1435+
self.assertAllClose(result, expected_result, 1e-3)
1436+
1437+
def test_weighted_prgain_interpolation(self):
1438+
auc_obj = metrics.AUC(
1439+
num_thresholds=self.num_thresholds, curve="PRGAIN"
1440+
)
1441+
result = auc_obj(
1442+
self.y_true, self.y_pred, sample_weight=self.sample_weight
1443+
)
1444+
1445+
# tp = [7, 4, 0], fp = [3, 0, 0], fn = [0, 3, 7], tn = [0, 3, 3]
1446+
# scaling_factor (P/N) = 7/3
1447+
# recall_gain = 1 - 7/3 [0/7, 3/4, 7/0] = [1, -3/4, -inf] -> [1, 0, 0]
1448+
# precision_gain = 1 - 7/3 [3/7, 0/4, 0/0] = [0, 1, NaN] -> [0, 1, 1]
1449+
# heights = [(0+1)/2, (1+1)/2] = [0.5, 1]
1450+
# widths = [(1 - 0), (0 - 0)] = [1, 0]
1451+
expected_result = 1 * 0.5 + 0 * 1
1452+
self.assertAllClose(result, expected_result, 1e-3)
1453+
1454+
def test_prgain_interpolation(self):
1455+
auc_obj = metrics.AUC(
1456+
num_thresholds=self.num_thresholds, curve="PRGAIN"
1457+
)
1458+
1459+
y_true = np.array([0, 0, 0, 1, 0, 1, 0, 1, 1, 1])
1460+
y_pred = np.array([0.1, 0.2, 0.3, 0.3, 0.4, 0.4, 0.6, 0.6, 0.8, 0.9])
1461+
result = auc_obj(y_true, y_pred)
1462+
1463+
# tp = [5, 3, 0], fp = [5, 1, 0], fn = [0, 2, 5], tn = [0, 4, 4]
1464+
# scaling_factor (P/N) = 5/5 = 1
1465+
# recall_gain = 1 - [0/5, 2/3, 5/0] = [1, 1/3, -inf] -> [1, 1/3, 0]
1466+
# precision_gain = 1 - [5/5, 1/3, 0/0] = [0, 2/3, NaN] -> [0, 2/3, 1]
1467+
# heights = [(0+2/3)/2, (2/3+1)/2] = [0.333333, 0.833333]
1468+
# widths = [(1 - 1/3), (1/3 - 0)] = [0.666666, 0.333333]
1469+
expected_result = 0.666666 * 0.333333 + 0.333333 * 0.833333
1470+
self.assertAllClose(result, expected_result, 1e-3)
1471+
13991472
def test_invalid_num_thresholds(self):
14001473
with self.assertRaisesRegex(
14011474
ValueError, "Argument `num_thresholds` must be an integer > 1"

keras/src/metrics/metrics_utils.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,17 +43,20 @@ class AUCCurve(Enum):
4343

4444
ROC = "ROC"
4545
PR = "PR"
46+
PRGAIN = "PRGAIN"
4647

4748
@staticmethod
4849
def from_str(key):
4950
if key in ("pr", "PR"):
5051
return AUCCurve.PR
5152
elif key in ("roc", "ROC"):
5253
return AUCCurve.ROC
54+
elif key in ("prgain", "PRGAIN"):
55+
return AUCCurve.PRGAIN
5356
else:
5457
raise ValueError(
5558
f'Invalid AUC curve value: "{key}". '
56-
'Expected values are ["PR", "ROC"]'
59+
'Expected values are ["PR", "ROC", "PRGAIN"]'
5760
)
5861

5962

0 commit comments

Comments
 (0)