Commit 27f120c

Merge pull request #63 from SFI-Visual-Intelligence/Jan-metrics
Merging the metrics updates with dataloader updates
2 parents: b7bffa3 + 4071181

5 files changed: +161 additions, -41 deletions

main.py

Lines changed: 1 addition & 1 deletion
@@ -46,7 +46,7 @@ def main():
         val_size=args.val_size,
     )
 
-    metrics = MetricWrapper(*args.metric, num_classes=traindata.num_classes)
+    metrics = MetricWrapper(*args.metric, num_classes=traindata.num_classes, macro_averaging=args.macro_averaging)
 
     # Find the shape of the data, if is 2D, add a channel dimension
     data_shape = traindata[0][0].shape
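Note: with this one-line change, the macro_averaging flag parsed in utils/arg_parser.py (next file) flows through MetricWrapper down to each individual metric.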

utils/arg_parser.py

Lines changed: 7 additions & 1 deletion
@@ -68,6 +68,12 @@ def get_args():
         nargs="+",
         help="Which metric to use for evaluation",
     )
+    parser.add_argument(
+        "--macro_averaging",
+        action="store_true",
+        help="If the flag is included, the metrics will be calculated using macro averaging.",
+    )
+
 
     parser.add_argument("--imagesize", type=int, default=28, help="Imagesize")
 
@@ -108,7 +114,7 @@ def get_args():
     parser.add_argument(
         "--dry_run",
         action="store_true",
-        help="If true, the code will not run the training loop.",
+        help="If the flag is included, the code will not run the training loop.",
     )
     args = parser.parse_args()
 
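As a usage sketch (the metric names are those accepted by MetricWrapper in the next file), a run enabling the new behaviour would look like:

    python main.py --metric f1 precision --macro_averaging

Since the flag uses action="store_true", omitting it keeps the default micro averaging.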

utils/load_metric.py

Lines changed: 6 additions & 5 deletions
@@ -45,10 +45,11 @@ class MetricWrapper(nn.Module):
     {'entropy': [], 'f1': [], 'precision': []}
     """
 
-    def __init__(self, *metrics, num_classes):
+    def __init__(self, *metrics, num_classes, macro_averaging=False):
         super().__init__()
         self.metrics = {}
         self.num_classes = num_classes
+        self.macro_averaging = macro_averaging
 
         for metric in metrics:
             self.metrics[metric] = self._get_metric(metric)
@@ -74,13 +75,13 @@ def _get_metric(self, key):
             case "entropy":
                 return EntropyPrediction(num_classes=self.num_classes)
             case "f1":
-                return F1Score(num_classes=self.num_classes)
+                return F1Score(num_classes=self.num_classes, macro_averaging=self.macro_averaging)
             case "recall":
-                return Recall(num_classes=self.num_classes)
+                return Recall(num_classes=self.num_classes, macro_averaging=self.macro_averaging)
             case "precision":
-                return Precision(num_classes=self.num_classes)
+                return Precision(num_classes=self.num_classes, macro_averaging=self.macro_averaging)
             case "accuracy":
-                return Accuracy(num_classes=self.num_classes)
+                return Accuracy(num_classes=self.num_classes, macro_averaging=self.macro_averaging)
             case _:
                 raise ValueError(f"Metric {key} not supported")
 
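A minimal construction sketch mirroring the updated call site in main.py (the import path and num_classes value here are illustrative, not from the diff):

    from utils.load_metric import MetricWrapper

    # Builds an F1Score and an Accuracy instance, both with macro averaging enabled;
    # EntropyPrediction, by contrast, still receives only num_classes.
    metrics = MetricWrapper("f1", "accuracy", num_classes=10, macro_averaging=True)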

utils/metrics/F1.py

Lines changed: 83 additions & 30 deletions
@@ -4,29 +4,39 @@
 
 class F1Score(nn.Module):
     """
-    F1 Score implementation with direct averaging inside the compute method.
+    F1 Score implementation with support for both macro and micro averaging.
+
+    This class computes the F1 score during training using either macro or micro averaging.
+    The F1 score is calculated based on the true positives (TP), false positives (FP),
+    and false negatives (FN) for each class.
 
     Parameters
     ----------
     num_classes : int
-        Number of classes.
+        The number of classes in the classification task.
+
+    macro_averaging : bool, optional, default=False
+        If True, computes the macro-averaged F1 score. If False, computes the micro-averaged F1 score.
 
     Attributes
     ----------
     num_classes : int
-        The number of classes.
+        The number of classes in the classification task.
 
     tp : torch.Tensor
-        Tensor for True Positives (TP) for each class.
+        Tensor storing the count of True Positives (TP) for each class.
 
     fp : torch.Tensor
-        Tensor for False Positives (FP) for each class.
+        Tensor storing the count of False Positives (FP) for each class.
 
     fn : torch.Tensor
-        Tensor for False Negatives (FN) for each class.
+        Tensor storing the count of False Negatives (FN) for each class.
+
+    macro_averaging : bool
+        A flag indicating whether to compute the macro-averaged F1 score or not.
     """
 
-    def __init__(self, num_classes):
+    def __init__(self, num_classes, macro_averaging=False):
        """
        Initializes the F1Score object, setting up the necessary state variables.
 
@@ -35,28 +45,81 @@ def __init__(self, num_classes):
        num_classes : int
            The number of classes in the classification task.
 
+        macro_averaging : bool, optional, default=False
+            If True, computes the macro-averaged F1 score. If False, computes the micro-averaged F1 score.
        """
-
        super().__init__()
 
        self.num_classes = num_classes
+        self.macro_averaging = macro_averaging
 
-        # Initialize variables for True Positives (TP), False Positives (FP), and False Negatives (FN)
+        # Initialize variables for True Positives (TP), False Positives (FP), and False Negatives (FN)
        self.tp = torch.zeros(num_classes)
        self.fp = torch.zeros(num_classes)
        self.fn = torch.zeros(num_classes)
 
-    def update(self, preds, target):
+    def _micro_F1(self):
+        """
+        Compute the Micro F1 score by aggregating TP, FP, and FN across all classes.
+
+        Micro F1 score is calculated globally by considering all predictions together, regardless of class.
+
+        Returns
+        -------
+        torch.Tensor
+            The micro-averaged F1 score.
        """
-        Update the variables with predictions and true labels.
+        tp = torch.sum(self.tp)
+        fp = torch.sum(self.fp)
+        fn = torch.sum(self.fn)
+
+        precision = tp / (tp + fp + 1e-8)  # Avoid division by zero
+        recall = tp / (tp + fn + 1e-8)  # Avoid division by zero
+
+        f1 = 2 * precision * recall / (precision + recall + 1e-8)  # Avoid division by zero
+        return f1
+
+    def _macro_F1(self):
+        """
+        Compute the Macro F1 score by calculating the F1 score per class and averaging.
+
+        Macro F1 score is calculated as the average of per-class F1 scores. This approach treats all classes equally,
+        regardless of their frequency.
+
+        Returns
+        -------
+        torch.Tensor
+            The macro-averaged F1 score.
+        """
+        precision_per_class = self.tp / (self.tp + self.fp + 1e-8)  # Avoid division by zero
+        recall_per_class = self.tp / (self.tp + self.fn + 1e-8)  # Avoid division by zero
+        f1_per_class = 2 * precision_per_class * recall_per_class / (
+            precision_per_class + recall_per_class + 1e-8)  # Avoid division by zero
+
+        # Take the average of F1 scores across all classes
+        f1_score = torch.mean(f1_per_class)
+        return f1_score
+
+    def forward(self, preds, target):
+        """
+        Update the True Positives, False Positives, and False Negatives, and compute the F1 score.
+
+        This method computes the F1 score based on the predictions and true labels. It can compute either the
+        macro-averaged or micro-averaged F1 score, depending on the `macro_averaging` flag.
 
        Parameters
        ----------
        preds : torch.Tensor
-            Predicted logits (shape: [batch_size, num_classes]).
+            Predicted logits or class indices (shape: [batch_size, num_classes]).
+            These logits are typically the output of a softmax or sigmoid activation.
 
        target : torch.Tensor
-            True labels (shape: [batch_size]).
+            True labels (shape: [batch_size]), where each element is an integer representing the true class.
+
+        Returns
+        -------
+        torch.Tensor
+            The computed F1 score (either micro or macro, based on `macro_averaging`).
        """
        preds = torch.argmax(preds, dim=1)
 
@@ -66,21 +129,11 @@ def update(self, preds, target):
            self.fp[i] += torch.sum((preds == i) & (target != i)).float()
            self.fn[i] += torch.sum((preds != i) & (target == i)).float()
 
-    def compute(self):
-        """
-        Compute the F1 score.
+        if self.macro_averaging:
+            # Calculate Macro F1 score
+            f1_score = self._macro_F1()
+        else:
+            # Calculate Micro F1 score
+            f1_score = self._micro_F1()
 
-        Returns
-        -------
-        torch.Tensor
-            The computed F1 score.
-        """
-
-        # Compute F1 score based on the specified averaging method
-        f1_score = (
-            2
-            * torch.sum(self.tp)
-            / (2 * torch.sum(self.tp) + torch.sum(self.fp) + torch.sum(self.fn))
-        )
-
-        return f1_score
+        return f1_score
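A small usage sketch of the rewritten class (the import path follows the repo layout; the tensors are illustrative):

    import torch
    from utils.metrics.F1 import F1Score

    logits = torch.tensor([[2.0, 0.1, 0.1],   # argmax -> class 0 (correct)
                           [0.1, 2.0, 0.1],   # argmax -> class 1 (correct)
                           [0.1, 2.0, 0.1]])  # argmax -> class 1 (true class is 2)
    target = torch.tensor([0, 1, 2])

    micro = F1Score(num_classes=3)                        # micro averaging (default)
    macro = F1Score(num_classes=3, macro_averaging=True)  # macro averaging

    print(micro(logits, target))  # ~0.667: pooled TP/FP/FN give precision = recall = 2/3
    print(macro(logits, target))  # ~0.556: mean of per-class F1 scores (1.0, 0.667, 0.0)

Note that forward applies torch.argmax along dim=1 unconditionally, so despite the docstring mentioning class indices, preds must be 2-D scores; and since tp/fp/fn accumulate across calls, each instance keeps running totals.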

utils/metrics/accuracy.py

Lines changed: 64 additions & 4 deletions
@@ -3,10 +3,11 @@
 
 
 class Accuracy(nn.Module):
-    def __init__(self, num_classes):
+    def __init__(self, num_classes, macro_averaging=False):
        super().__init__()
        self.num_classes = num_classes
-
+        self.macro_averaging = macro_averaging
+
    def forward(self, y_true, y_pred):
        """
        Compute the accuracy of the model.
@@ -23,12 +24,71 @@ def forward(self, y_true, y_pred):
        float
            Accuracy score.
        """
+        if y_pred.dim() > 1:
+            y_pred = y_pred.argmax(dim=1)
+        if self.macro_averaging:
+            return self._macro_acc(y_true, y_pred)
+        else:
+            return self._micro_acc(y_true, y_pred)
+
+    def _macro_acc(self, y_true, y_pred):
+        """
+        Compute the macro-average accuracy.
+
+        Parameters
+        ----------
+        y_true : torch.Tensor
+            True labels.
+        y_pred : torch.Tensor
+            Predicted labels.
+
+        Returns
+        -------
+        float
+            Macro-average accuracy score.
+        """
+        y_true, y_pred = y_true.flatten(), y_pred.flatten()  # Ensure 1D shape
+
+        classes = torch.unique(y_true)  # Find unique class labels
+        acc_per_class = []
+
+        for c in classes:
+            mask = (y_true == c)  # Mask for class c
+            acc = (y_pred[mask] == y_true[mask]).float().mean()  # Accuracy for class c
+            acc_per_class.append(acc)
+
+        macro_acc = torch.stack(acc_per_class).mean().item()  # Average across classes
+        return macro_acc
+
+    def _micro_acc(self, y_true, y_pred):
+        """
+        Compute the micro-average accuracy.
+
+        Parameters
+        ----------
+        y_true : torch.Tensor
+            True labels.
+        y_pred : torch.Tensor
+            Predicted labels.
+
+        Returns
+        -------
+        float
+            Micro-average accuracy score.
+        """
        return (y_true == y_pred).float().mean().item()
 
 
 if __name__ == "__main__":
+    accuracy = Accuracy(5)
+    macro_accuracy = Accuracy(5, macro_averaging=True)
+
    y_true = torch.tensor([0, 3, 2, 3, 4])
    y_pred = torch.tensor([0, 1, 2, 3, 4])
-
-    accuracy = Accuracy()
    print(accuracy(y_true, y_pred))
+    print(macro_accuracy(y_true, y_pred))
+
+    y_true = torch.tensor([0, 3, 2, 3, 4])
+    y_onehot_pred = torch.tensor([[1, 0, 0, 0, 0], [0, 1, 0, 0, 0], [0, 0, 1, 0, 0], [0, 0, 0, 1, 0], [0, 0, 0, 0, 1]])
+    print(accuracy(y_true, y_onehot_pred))
+    print(macro_accuracy(y_true, y_onehot_pred))
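For the demo above: micro accuracy is 4/5 = 0.8 (four of five predictions match), while macro accuracy averages per-class accuracy over the classes present in y_true ({0, 2, 3, 4}), giving (1.0 + 1.0 + 0.5 + 1.0) / 4 = 0.875. The one-hot block should print the same pair of values, since the argmax of each one-hot row recovers the identical predicted labels.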
