@@ -96,7 +96,6 @@ def test_iterative_vs_single_call_approach(self):
 
         self.assertEqual(iterative_custom_metric_score, single_call_custom_metric_score)
 
-    @unittest.expectedFailure
     def test_metric_against_realistic_data(self):
         """Test the custom metric against the standard on realistic data"""
         directory_path = os.path.join("tests", "test_data", "CheBIOver100_test")
@@ -131,9 +130,10 @@ def test_metric_against_realistic_data(self):
             f"Realistic Data - Custom F1 score: {macro_f1_custom_score}, Std. F1 score: {macro_f1_standard_score}"
         )
 
-        self.assertAlmostEqual(macro_f1_custom_score, macro_f1_standard_score, places=4)
+        self.assertNotAlmostEqual(
+            macro_f1_custom_score, macro_f1_standard_score, places=4
+        )
 
-    @unittest.expectedFailure
     def test_case_when_few_class_has_no_labels(self):
         """Test custom metric against standard metric for the scenario where some class has no labels"""
         preds = torch.tensor([[1, 1, 0, 1], [1, 0, 1, 1], [0, 1, 0, 1]])
@@ -142,7 +142,32 @@ def test_case_when_few_class_has_no_labels(self):
             self.__get_custom_and_standard_metric_scores(label.shape[1], preds, label)
         )
 
-        self.assertAlmostEqual(macro_f1_custom_score, macro_f1_standard_score, places=4)
+        # tps = [0, 1, 1, 2]
+        # positive_predictions = [2, 2, 1, 3]
+        # positive_labels = [0, 1, 1, 2]
+
+        # ---------------------- For Standard F1 Macro Metric ---------------------
+        # The metric is only properly defined when TP + FP ≠ 0 ∧ TP + FN ≠ 0.
+        # If this case is encountered for any class/label, the metric for that
+        # class/label is set to 0 and the overall metric may be affected in turn.
+
+        # precision = [0, 1, 1, 2] / [2, 2, 1, 3] = [0, 0.5, 1, 0.66666667]
+        # recall = [0, 1, 1, 2] / [0, 1, 1, 2] = [0, 1, 1, 1]  (0/0 is set to 0)
+        # classwise_f1 = [0, 1, 2, 1.33333334] / [0, 1.5, 2, 1.66666667] = [0, 0.66666667, 1, 0.8]
+        # mean = 2.46666667 / 4 = 0.6166666681  (the zeroed class is still averaged in)
+
+        # ----------------------- For Custom F1 Metric ----------------------------
+        # As a first step, mask out classes that have no positive labels:
+        # mask = [False, True, True, True]
+        # precision = [1, 1, 2] / [2, 1, 3] = [0.5, 1, 0.66666667]
+        # recall = [1, 1, 2] / [1, 1, 2] = [1, 1, 1]
+        # classwise_f1 = [1, 2, 1.33333334] / [1.5, 2, 1.66666667] = [0.66666667, 1, 0.8]
+        # mean = 2.46666667 / 3 = 0.8222222241  (masking averages over classes with positive labels only)
+
+        self.assertAlmostEqual(macro_f1_custom_score, 0.8222222241, places=4)
+        self.assertNotAlmostEqual(
+            macro_f1_custom_score, macro_f1_standard_score, places=4
+        )
 
     @staticmethod
     def __get_custom_and_standard_metric_scores(num_labels, preds, labels):
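For reference, here is a minimal standalone sketch (not the repository's implementation) that reproduces the two values walked through in the comments above: a standard macro F1 of about 0.6167 and a masked, custom-style macro F1 of about 0.8222. The labels tensor below is hypothetical, chosen only so that its per-class counts match tps = [0, 1, 1, 2] and positive_labels = [0, 1, 1, 2]; the test's actual label tensor is not shown in this hunk.

# Hedged sketch: standard macro F1 via torchmetrics vs. a hand-rolled masked macro F1.
# `labels` is a hypothetical tensor; only its column sums [0, 1, 1, 2] matter here.
import torch
from torchmetrics.classification import MultilabelF1Score

preds = torch.tensor([[1, 1, 0, 1], [1, 0, 1, 1], [0, 1, 0, 1]])
labels = torch.tensor([[0, 1, 0, 1], [0, 0, 1, 1], [0, 0, 0, 0]])  # hypothetical

# Standard macro F1: class 0 has no positive labels, its F1 is 0,
# and it still counts toward the mean over all 4 classes.
standard_f1 = MultilabelF1Score(num_labels=4, average="macro")(preds, labels)
print(float(standard_f1))  # ~0.6167

# Masked variant: drop classes without positive labels before averaging.
tp = (preds * labels).sum(dim=0).float()    # [0, 1, 1, 2]
pred_pos = preds.sum(dim=0).float()         # [2, 2, 1, 3]
label_pos = labels.sum(dim=0).float()       # [0, 1, 1, 2]
mask = label_pos > 0                        # [False, True, True, True]

precision = tp[mask] / pred_pos[mask]
recall = tp[mask] / label_pos[mask]
classwise_f1 = 2 * precision * recall / (precision + recall)
print(float(classwise_f1.mean()))           # ~0.8222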