
Commit a4c970d

added testing for custom metric + custom balanced acc implementation
1 parent 2e8ef97 commit a4c970d

6 files changed: +346 -2 lines changed

chebai/callbacks/epoch_metrics.py

Lines changed: 58 additions & 0 deletions
@@ -47,3 +47,61 @@ def compute(self):
        # if (precision and recall are 0) or (precision is nan), set f1 to 0
        classwise_f1 = classwise_f1.nan_to_num()
        return torch.mean(classwise_f1)


class BalancedAccuracy(torchmetrics.Metric):
    def __init__(self, num_labels, dist_sync_on_step=False, threshold=0.5):
        super().__init__(dist_sync_on_step=dist_sync_on_step)

        self.add_state(
            "true_positives",
            default=torch.zeros(num_labels, dtype=torch.int),
            dist_reduce_fx="sum",
        )

        self.add_state(
            "false_positives",
            default=torch.zeros(num_labels, dtype=torch.int),
            dist_reduce_fx="sum",
        )

        self.add_state(
            "true_negatives",
            default=torch.zeros(num_labels, dtype=torch.int),
            dist_reduce_fx="sum",
        )

        self.add_state(
            "false_negatives",
            default=torch.zeros(num_labels, dtype=torch.int),
            dist_reduce_fx="sum",
        )

        self.threshold = threshold

    def update(self, preds: torch.Tensor, labels: torch.Tensor):
        """Update the TPs, TNs, FPs and FNs."""

        # preds and labels have size: batch_size x num_of_classes;
        # summing over the batch dimension (dim=0) gives the per-class counts
        tps = torch.sum(torch.logical_and(preds > self.threshold, labels.to(torch.bool)), dim=0)
        fps = torch.sum(torch.logical_and(preds > self.threshold, ~labels.to(torch.bool)), dim=0)
        tns = torch.sum(torch.logical_and(preds <= self.threshold, ~labels.to(torch.bool)), dim=0)
        fns = torch.sum(torch.logical_and(preds <= self.threshold, labels.to(torch.bool)), dim=0)

        # Size: num_of_classes
        self.true_positives += tps
        self.false_positives += fps
        self.true_negatives += tns
        self.false_negatives += fns

    def compute(self):
        """Compute balanced accuracy from the accumulated per-class counts."""

        tpr = self.true_positives / (self.true_positives + self.false_negatives)
        tnr = self.true_negatives / (self.true_negatives + self.false_positives)
        # convert NaN values (classes with no positives or no negatives) to 0
        tpr = tpr.nan_to_num()
        tnr = tnr.nan_to_num()

        balanced_acc = (tpr + tnr) / 2
        return torch.mean(balanced_acc)
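
For orientation, a minimal usage sketch of the new metric outside the Lightning configs; the tensors below are illustrative and not part of the commit:

import torch
from chebai.callbacks.epoch_metrics import BalancedAccuracy

# toy batch: 3 samples, 4 classes; per-class balanced accuracy = (TPR + TNR) / 2
preds = torch.tensor([[0.9, 0.2, 0.7, 0.6],
                      [0.1, 0.8, 0.4, 0.9],
                      [0.3, 0.6, 0.2, 0.7]])
labels = torch.tensor([[1, 0, 1, 1],
                       [0, 1, 0, 1],
                       [0, 1, 0, 0]])

metric = BalancedAccuracy(num_labels=4)   # default threshold=0.5
metric.update(preds, labels)              # accumulates per-class TP/FP/TN/FN counts
print(metric.compute())                   # mean over classes of (TPR + TNR) / 2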

chebai/cli.py

Lines changed: 2 additions & 2 deletions
@@ -11,10 +11,10 @@ def __init__(self, *args, **kwargs):

    def add_arguments_to_parser(self, parser: LightningArgumentParser):
        for kind in ("train", "val", "test"):
-           for average in ("micro", "macro"):
+           for average in ("micro-f1", "macro-f1", "balanced-accuracy"):
                parser.link_arguments(
                    "model.init_args.out_dim",
-                   f"model.init_args.{kind}_metrics.init_args.metrics.{average}-f1.init_args.num_labels",
+                   f"model.init_args.{kind}_metrics.init_args.metrics.{average}.init_args.num_labels",
                )
        parser.link_arguments(
            "model.init_args.out_dim", "trainer.callbacks.init_args.num_labels"
Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
class_path: torchmetrics.MetricCollection
init_args:
  metrics:
    balanced-accuracy:
      class_path: chebai.callbacks.epoch_metrics.BalancedAccuracy
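
This config is roughly equivalent to constructing the collection by hand; a minimal sketch, assuming a placeholder num_labels (normally filled in from model.init_args.out_dim by the link_arguments calls in chebai/cli.py above):

import torchmetrics
from chebai.callbacks.epoch_metrics import BalancedAccuracy

num_labels = 1000  # placeholder; the CLI links this from model.init_args.out_dim

# the dict key must match the name targeted in cli.py ("balanced-accuracy"),
# i.e. ...metrics.balanced-accuracy.init_args.num_labels
metrics = torchmetrics.MetricCollection(
    {"balanced-accuracy": BalancedAccuracy(num_labels=num_labels)}
)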

setup.py

Lines changed: 1 addition & 0 deletions
@@ -49,6 +49,7 @@
        "wandb",
        "chardet",
        "yaml",
+       "torchmetrics",
    ],
    extras_require={"dev": ["black", "isort", "pre-commit"]},
)
Lines changed: 120 additions & 0 deletions
@@ -0,0 +1,120 @@
import unittest
import torch
import os
from chebai.callbacks.epoch_metrics import BalancedAccuracy
import random


class TestCustomMacroF1Metric(unittest.TestCase):

    @classmethod
    def setUpClass(cls) -> None:
        cls.device = "cuda" if torch.cuda.is_available() else "cpu"

    def test_iterative_vs_single_call_approach(self):
        """Test the custom metric implementation in the iterative (update-based) approach against
        the single-call approach"""

        preds = torch.tensor([[1, 1, 0, 1],
                              [1, 0, 1, 1],
                              [0, 1, 0, 1]])
        label = torch.tensor([[0, 0, 0, 0],
                              [0, 0, 1, 1],
                              [0, 1, 0, 1]])

        num_labels = label.shape[1]
        iterative_custom_metric = BalancedAccuracy(num_labels=num_labels)
        for i in range(label.shape[0]):
            iterative_custom_metric.update(preds[i].unsqueeze(0), label[i].unsqueeze(0))
        iterative_custom_metric_score = iterative_custom_metric.compute().item()

        single_call_custom_metric = BalancedAccuracy(num_labels=num_labels)
        single_call_custom_metric_score = single_call_custom_metric(preds, label).item()

        self.assertEqual(iterative_custom_metric_score, single_call_custom_metric_score)

    def test_metric_against_realistic_data(self):
        """Run the custom metric on realistic data"""
        directory_path = "CheBIOver100_test"
        abs_path = os.path.join(os.getcwd(), directory_path)
        print(f"Checking data from - {abs_path}")
        num_of_files = len(os.listdir(abs_path)) // 2

        # load a single file to get the number of labels for metric class instantiation
        labels = torch.load(f'{directory_path}/labels{0:03d}.pt', map_location=torch.device(self.device))
        num_labels = labels.shape[1]
        balanced_acc_custom = BalancedAccuracy(num_labels=num_labels)

        for i in range(num_of_files):
            labels = torch.load(f'{directory_path}/labels{i:03d}.pt', map_location=torch.device(self.device))
            preds = torch.load(f'{directory_path}/preds{i:03d}.pt', map_location=torch.device(self.device))
            balanced_acc_custom.update(preds, labels)

        balanced_acc_custom_score = balanced_acc_custom.compute().item()
        print(f"Balanced accuracy for realistic data: {balanced_acc_custom_score}")

    def test_case_when_few_class_has_no_labels(self):
        """Test the custom metric for the scenario where some classes have no positive labels"""
        preds = torch.tensor([[1, 1, 0, 1],
                              [1, 0, 1, 1],
                              [0, 1, 0, 1]])
        label = torch.tensor([[0, 0, 0, 0],  # no positive labels
                              [0, 0, 1, 1],
                              [0, 1, 0, 1]])

        # tp = [0, 1, 1, 2], fp = [2, 1, 0, 1], tn = [1, 1, 2, 0], fn = [0, 0, 0, 0]
        # tpr = [0, 1, 1, 2] / ([0, 1, 1, 2] + [0, 0, 0, 0]) = [0, 1, 1, 1]   (NaN -> 0)
        # tnr = [1, 1, 2, 0] / ([1, 1, 2, 0] + [2, 1, 0, 1]) = [0.33333, 0.5, 1, 0]
        # balanced accuracy = ([0, 1, 1, 1] + [0.33333, 0.5, 1, 0]) / 2 = [0.16666667, 0.75, 1, 0.5]
        # mean balanced accuracy = 0.6041666666666666

        balanced_acc_score = self.__get_custom_metric_score(preds, label, label.shape[1])

        self.assertAlmostEqual(balanced_acc_score, 0.6041666666, places=4)

    def test_all_predictions_are_1_half_labels_are_1(self):
        """Test the custom metric for the scenario where all predictions are 1 but only half of
        the labels are 1"""
        preds = torch.ones((1, 900), dtype=torch.int)
        label = torch.ones((1, 900), dtype=torch.int)

        mask = [[True] * (label.size(1) // 2) + [False] * (label.size(1) - (label.size(1) // 2))]
        random.shuffle(mask[0])
        label[torch.tensor(mask)] = 0

        # e.g. preds = [1, 1, 1, 1], label = [0, 1, 0, 1]
        # tp = [0, 1, 0, 1], fp = [1, 0, 1, 0], tn = [0, 0, 0, 0], fn = [0, 0, 0, 0]
        # tpr = tp / (tp + fn) = [0, 1, 0, 1] / [0, 1, 0, 1] = [0, 1, 0, 1]   (NaN -> 0)
        # tnr = tn / (tn + fp) = [0, 0, 0, 0]
        # mean balanced accuracy = mean([0, 0.5, 0, 0.5]) = 0.25

        balanced_acc_custom_score = self.__get_custom_metric_score(preds, label, label.shape[1])
        self.assertAlmostEqual(balanced_acc_custom_score, 0.25, places=4)

    def test_all_labels_are_1_half_predictions_are_1(self):
        """Test the custom metric for the scenario where all labels are 1 but only half of
        the predictions are 1"""
        preds = torch.ones((1, 900), dtype=torch.int)
        label = torch.ones((1, 900), dtype=torch.int)

        mask = [[True] * (label.size(1) // 2) + [False] * (label.size(1) - (label.size(1) // 2))]
        random.shuffle(mask[0])
        preds[torch.tensor(mask)] = 0

        # e.g. label = [1, 1, 1, 1], preds = [0, 1, 0, 1]
        # tp = [0, 1, 0, 1], fp = [0, 0, 0, 0], tn = [0, 0, 0, 0], fn = [1, 0, 1, 0]
        # tpr = tp / (tp + fn) = [0, 1, 0, 1] / [1, 1, 1, 1] = [0, 1, 0, 1]
        # tnr = tn / (tn + fp) = [0, 0, 0, 0]   (NaN -> 0)
        # mean balanced accuracy = mean([0, 0.5, 0, 0.5]) = 0.25

        balanced_acc_custom_score = self.__get_custom_metric_score(preds, label, label.shape[1])
        self.assertAlmostEqual(balanced_acc_custom_score, 0.25, places=4)

    @staticmethod
    def __get_custom_metric_score(preds, labels, num_labels):
        balanced_acc_custom = BalancedAccuracy(num_labels=num_labels)
        return balanced_acc_custom(preds, labels).item()


if __name__ == "__main__":
    unittest.main()

tests/testCustomMacroF1Metric.py

Lines changed: 160 additions & 0 deletions
@@ -0,0 +1,160 @@
import unittest
import torch
import os
from chebai.callbacks.epoch_metrics import MacroF1
from torchmetrics.classification import MultilabelF1Score
import random


class TestCustomMacroF1Metric(unittest.TestCase):

    @classmethod
    def setUpClass(cls) -> None:
        cls.device = "cuda" if torch.cuda.is_available() else "cpu"

    @unittest.expectedFailure
    def test_all_predictions_are_1_half_labels_are_1(self):
        """Test the custom metric against the standard metric for the scenario where all
        predictions are 1 but only half of the labels are 1"""
        preds = torch.ones((1, 900), dtype=torch.int)
        label = torch.ones((1, 900), dtype=torch.int)

        mask = [
            [True] * (label.size(1) // 2)
            + [False] * (label.size(1) - (label.size(1) // 2))
        ]
        random.shuffle(mask[0])
        label[torch.tensor(mask)] = 0

        macro_f1_custom_score, macro_f1_standard_score = (
            self.__get_custom_and_standard_metric_scores(label.shape[1], preds, label)
        )

        # e.g. preds = torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
        #      label = torch.tensor([[1, 1, 0, 0, 1, 1, 0, 0, 1, 0]])
        # tps = [1, 1, 0, 0, 1, 1, 0, 0, 1, 0]
        # positive_predictions = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
        # positive_labels = [1, 1, 0, 0, 1, 1, 0, 0, 1, 0]

        # ---------------------- For the standard macro-F1 metric ---------------------
        # The metric is only properly defined when TP + FP ≠ 0 ∧ TP + FN ≠ 0.
        # If this case is encountered for any class/label, the metric for that class/label
        # is set to 0, and the overall metric may therefore be affected in turn.

        # precision = [1, 1, 0, 0, 1, 1, 0, 0, 1, 0]
        # recall = [1, 1, 0, 0, 1, 1, 0, 0, 1, 0]
        # classwise_f1 = [2, 2, 0, 0, 2, 2, 0, 0, 2, 0] / [2, 2, 0, 0, 2, 2, 0, 0, 2, 0]
        #              = [1, 1, 0, 0, 1, 1, 0, 0, 1, 0]
        # mean = 5/10 = 0.5

        # ----------------------- For the custom F1 metric ----------------------------
        # Masking is performed as a first step to keep only classes with positive labels
        # mask = [True, True, False, False, True, True, False, False, True, False]
        # precision = [1, 1, 1, 1, 1] / [1, 1, 1, 1, 1] = [1, 1, 1, 1, 1]
        # recall = [1, 1, 1, 1, 1] / [1, 1, 1, 1, 1] = [1, 1, 1, 1, 1]
        # classwise_f1 = [2, 2, 2, 2, 2] / [2, 2, 2, 2, 2] = [1, 1, 1, 1, 1]
        # mean = 5/5 = 1 (because of the masking we average across positive labels only)

        self.assertAlmostEqual(macro_f1_custom_score, macro_f1_standard_score, places=4)

    def test_all_labels_are_1_half_predictions_are_1(self):
        """Test the custom metric against the standard metric for the scenario where all labels
        are 1 but only half of the predictions are 1"""
        preds = torch.ones((1, 900), dtype=torch.int)
        label = torch.ones((1, 900), dtype=torch.int)

        mask = [
            [True] * (label.size(1) // 2)
            + [False] * (label.size(1) - (label.size(1) // 2))
        ]
        random.shuffle(mask[0])
        preds[torch.tensor(mask)] = 0

        macro_f1_custom_score, macro_f1_standard_score = (
            self.__get_custom_and_standard_metric_scores(label.shape[1], preds, label)
        )

        # As the custom metric only considers classes with positive labels (via masking),
        # and all labels are positive in this scenario, the custom and standard metrics agree.
        self.assertAlmostEqual(macro_f1_custom_score, macro_f1_standard_score, places=4)

    def test_iterative_vs_single_call_approach(self):
        """Test the custom metric implementation in the iterative (update-based) approach against
        the single-call approach"""
        preds = torch.tensor([[1, 1, 0, 1], [1, 0, 1, 1], [0, 1, 0, 1]])
        label = torch.tensor([[0, 0, 0, 0], [0, 0, 1, 1], [0, 1, 0, 1]])

        num_labels = label.shape[1]
        iterative_custom_metric = MacroF1(num_labels=num_labels)
        for i in range(label.shape[0]):
            iterative_custom_metric.update(preds[i].unsqueeze(0), label[i].unsqueeze(0))
        iterative_custom_metric_score = iterative_custom_metric.compute().item()

        single_call_custom_metric = MacroF1(num_labels=num_labels)
        single_call_custom_metric_score = single_call_custom_metric(preds, label).item()

        self.assertEqual(iterative_custom_metric_score, single_call_custom_metric_score)

    @unittest.expectedFailure
    def test_metric_against_realistic_data(self):
        """Test the custom metric against the standard metric on realistic data"""
        directory_path = "CheBIOver100_test"
        abs_path = os.path.join(os.getcwd(), directory_path)
        print(f"Checking data from - {abs_path}")
        num_of_files = len(os.listdir(abs_path)) // 2

        # load a single file to get the number of labels for metric class instantiation
        labels = torch.load(
            f"{directory_path}/labels{0:03d}.pt", map_location=torch.device(self.device)
        )
        num_labels = labels.shape[1]
        macro_f1_custom = MacroF1(num_labels=num_labels)
        macro_f1_standard = MultilabelF1Score(num_labels=num_labels, average="macro")

        # load each file in the directory and update the stats
        for i in range(num_of_files):
            labels = torch.load(
                f"{directory_path}/labels{i:03d}.pt",
                map_location=torch.device(self.device),
            )
            preds = torch.load(
                f"{directory_path}/preds{i:03d}.pt",
                map_location=torch.device(self.device),
            )
            macro_f1_standard.update(preds, labels)
            macro_f1_custom.update(preds, labels)

        macro_f1_custom_score = macro_f1_custom.compute().item()
        macro_f1_standard_score = macro_f1_standard.compute().item()
        print(
            f"Realistic Data - Custom F1 score: {macro_f1_custom_score}, Std. F1 score: {macro_f1_standard_score}"
        )

        self.assertAlmostEqual(macro_f1_custom_score, macro_f1_standard_score, places=4)

    @unittest.expectedFailure
    def test_case_when_few_class_has_no_labels(self):
        """Test the custom metric against the standard metric for the scenario where some classes
        have no positive labels"""
        preds = torch.tensor([[1, 1, 0, 1], [1, 0, 1, 1], [0, 1, 0, 1]])
        label = torch.tensor([[0, 0, 0, 0], [0, 0, 1, 1], [0, 1, 0, 1]])
        macro_f1_custom_score, macro_f1_standard_score = (
            self.__get_custom_and_standard_metric_scores(label.shape[1], preds, label)
        )

        self.assertAlmostEqual(macro_f1_custom_score, macro_f1_standard_score, places=4)

    @staticmethod
    def __get_custom_and_standard_metric_scores(num_labels, preds, labels):
        # Custom metric score
        macro_f1_custom = MacroF1(num_labels=num_labels)
        macro_f1_custom_score = macro_f1_custom(preds, labels).item()

        # Standard metric score
        macro_f1_standard = MultilabelF1Score(num_labels=num_labels, average="macro")
        macro_f1_standard_score = macro_f1_standard(preds, labels).item()

        return macro_f1_custom_score, macro_f1_standard_score


if __name__ == "__main__":
    unittest.main()
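
The expectedFailure cases above hinge on how the standard metric scores classes without positive labels (an F1 of 0 for such classes, as worked out in the comments). A minimal sketch reproducing the 0.5 figure from that derivation; the toy tensors are copied from the comment and are not part of the commit:

import torch
from torchmetrics.classification import MultilabelF1Score

preds = torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
label = torch.tensor([[1, 1, 0, 0, 1, 1, 0, 0, 1, 0]])

# classes with no positive labels contribute an F1 of 0, pulling the macro
# average down to 0.5; per the comments, the custom MacroF1 masks them out
# and yields 1.0 for the same inputs
standard = MultilabelF1Score(num_labels=10, average="macro")
print(standard(preds, label))  # ~0.5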
