
Commit 5cbc5a0

Author: sfluegel
Parents: 660ac34 + 3ca5707

Merge branch 'refs/heads/dev' into feature/improved-hyperparameter-tracking

# Conflicts:
#	configs/data/chebi100.yml


61 files changed: +1455, −66 lines

chebai/callbacks/epoch_metrics.py

Lines changed: 74 additions & 0 deletions
@@ -47,3 +47,77 @@ def compute(self):
         # if (precision and recall are 0) or (precision is nan), set f1 to 0
         classwise_f1 = classwise_f1.nan_to_num()
         return torch.mean(classwise_f1)
+
+
+class BalancedAccuracy(torchmetrics.Metric):
+    """Balanced Accuracy = (TPR + TNR) / 2 = (TP / (TP + FN) + TN / (TN + FP)) / 2
+
+    This metric computes the balanced accuracy, i.e. the average of the true positive rate (TPR)
+    and the true negative rate (TNR). It is useful for imbalanced datasets where the classes are
+    not represented equally.
+    """
+
+    def __init__(self, num_labels, dist_sync_on_step=False, threshold=0.5):
+        super().__init__(dist_sync_on_step=dist_sync_on_step)
+
+        self.add_state(
+            "true_positives",
+            default=torch.zeros(num_labels, dtype=torch.int),
+            dist_reduce_fx="sum",
+        )
+        self.add_state(
+            "false_positives",
+            default=torch.zeros(num_labels, dtype=torch.int),
+            dist_reduce_fx="sum",
+        )
+        self.add_state(
+            "true_negatives",
+            default=torch.zeros(num_labels, dtype=torch.int),
+            dist_reduce_fx="sum",
+        )
+        self.add_state(
+            "false_negatives",
+            default=torch.zeros(num_labels, dtype=torch.int),
+            dist_reduce_fx="sum",
+        )
+
+        self.threshold = threshold
+
+    def update(self, preds: torch.Tensor, labels: torch.Tensor):
+        """Update the TPs, TNs, FPs and FNs"""
+        # size: batch_size x num_of_classes;
+        # summing over the batch dimension (dim=0) gives the counts per class
+        tps = torch.sum(
+            torch.logical_and(preds > self.threshold, labels.to(torch.bool)), dim=0
+        )
+        fps = torch.sum(
+            torch.logical_and(preds > self.threshold, ~labels.to(torch.bool)), dim=0
+        )
+        tns = torch.sum(
+            torch.logical_and(preds <= self.threshold, ~labels.to(torch.bool)), dim=0
+        )
+        fns = torch.sum(
+            torch.logical_and(preds <= self.threshold, labels.to(torch.bool)), dim=0
+        )
+
+        # size: num_of_classes
+        self.true_positives += tps
+        self.false_positives += fps
+        self.true_negatives += tns
+        self.false_negatives += fns
+
+    def compute(self):
+        """Compute the balanced accuracy, averaged over all classes"""
+        tpr = self.true_positives / (self.true_positives + self.false_negatives)
+        tnr = self.true_negatives / (self.true_negatives + self.false_positives)
+        # convert NaN values (classes with no positive / no negative samples) to 0
+        tpr = tpr.nan_to_num()
+        tnr = tnr.nan_to_num()
+
+        balanced_acc = (tpr + tnr) / 2
+        return torch.mean(balanced_acc)
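For reference, a minimal standalone use of the new metric might look like this (the import path follows the file above; the input values are illustrative):

import torch
from chebai.callbacks.epoch_metrics import BalancedAccuracy

metric = BalancedAccuracy(num_labels=3)
preds = torch.tensor([[0.9, 0.2, 0.7], [0.1, 0.8, 0.4]])  # predicted probabilities
labels = torch.tensor([[1, 0, 1], [0, 1, 1]])
metric.update(preds, labels)
# classes 0 and 1 score 1.0; class 2 has TPR 0.5 and no negative samples (TNR -> 0),
# so the mean balanced accuracy is (1.0 + 1.0 + 0.25) / 3 = 0.75
print(metric.compute())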

chebai/cli.py

Lines changed: 5 additions & 2 deletions
@@ -11,14 +11,17 @@ def __init__(self, *args, **kwargs):
 
     def add_arguments_to_parser(self, parser: LightningArgumentParser):
         for kind in ("train", "val", "test"):
-            for average in ("micro", "macro"):
+            for average in ("micro-f1", "macro-f1", "balanced-accuracy"):
                 parser.link_arguments(
                     "model.init_args.out_dim",
-                    f"model.init_args.{kind}_metrics.init_args.metrics.{average}-f1.init_args.num_labels",
+                    f"model.init_args.{kind}_metrics.init_args.metrics.{average}.init_args.num_labels",
                 )
         parser.link_arguments(
             "model.init_args.out_dim", "trainer.callbacks.init_args.num_labels"
         )
+        parser.link_arguments(
+            "data", "model.init_args.criterion.init_args.data_extractor"
+        )
 
     @staticmethod
     def subcommands() -> Dict[str, Set[str]]:
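The renamed keys suggest that each {kind}_metrics module holds a metrics dict keyed by micro-f1, macro-f1 and balanced-accuracy, so that num_labels can be linked into every entry. A hypothetical collection with matching keys (torchmetrics' built-in F1 is used purely for illustration; the repository's own F1 classes may be what the configs actually instantiate):

import torchmetrics
from torchmetrics.classification import MultilabelF1Score

from chebai.callbacks.epoch_metrics import BalancedAccuracy

num_labels = 1000  # placeholder; the CLI links this value from model.init_args.out_dim

metrics = torchmetrics.MetricCollection(
    {
        "micro-f1": MultilabelF1Score(num_labels=num_labels, average="micro"),
        "macro-f1": MultilabelF1Score(num_labels=num_labels, average="macro"),
        "balanced-accuracy": BalancedAccuracy(num_labels=num_labels),
    }
)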

chebai/loss/bce_weighted.py

Lines changed: 13 additions & 5 deletions
@@ -1,5 +1,6 @@
 import torch
-from chebai.preprocessing.datasets.chebi import _ChEBIDataExtractor
+from chebai.preprocessing.datasets.base import XYBaseDataModule
+from chebai.preprocessing.datasets.pubchem import LabeledUnlabeledMixed
 import pandas as pd
 import os
 import pickle
@@ -10,9 +11,16 @@ class BCEWeighted(torch.nn.BCEWithLogitsLoss):
     https://openaccess.thecvf.com/content_CVPR_2019/papers/Cui_Class-Balanced_Loss_Based_on_Effective_Number_of_Samples_CVPR_2019_paper.pdf)
     """
 
-    def __init__(self, beta: float = None, data_extractor: _ChEBIDataExtractor = None):
+    def __init__(
+        self,
+        beta: float = None,
+        data_extractor: XYBaseDataModule = None,
+    ):
         self.beta = beta
+        if isinstance(data_extractor, LabeledUnlabeledMixed):
+            data_extractor = data_extractor.labeled
         self.data_extractor = data_extractor
+
         super().__init__()
 
     def set_pos_weight(self, input):
@@ -27,16 +35,16 @@ def set_pos_weight(self, input):
         ):
             complete_data = pd.concat(
                 [
-                    pickle.load(
+                    pd.read_pickle(
                         open(
                             os.path.join(
                                 self.data_extractor.raw_dir,
-                                self.data_extractor.raw_file_names_dict[set],
+                                raw_file_name,
                             ),
                             "rb",
                         )
                     )
-                    for set in ["train", "validation", "test"]
+                    for raw_file_name in self.data_extractor.raw_file_names
                 ]
             )
             value_counts = []
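For context, the paper linked above (Cui et al., CVPR 2019) weights classes by the inverse of their "effective number of samples". A minimal sketch of that formula, assuming per-class positive counts have already been extracted from the raw files; the exact normalisation inside set_pos_weight is not visible in this hunk:

import torch

def class_balanced_weights(positive_counts: torch.Tensor, beta: float = 0.99) -> torch.Tensor:
    # effective number of samples per class: E_c = (1 - beta^n_c) / (1 - beta)
    effective_num = 1.0 - torch.pow(beta, positive_counts.float())
    # weights are proportional to 1 / E_c
    weights = (1.0 - beta) / effective_num
    return weights / weights.mean()  # normalise so the average weight is 1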

chebai/loss/semantic.py

Lines changed: 20 additions & 6 deletions
@@ -7,20 +7,29 @@
 from typing import Literal
 
 from chebai.preprocessing.datasets.chebi import _ChEBIDataExtractor, ChEBIOver100
+from chebai.preprocessing.datasets.pubchem import LabeledUnlabeledMixed
+from chebai.loss.bce_weighted import BCEWeighted
 
 
 class ImplicationLoss(torch.nn.Module):
     def __init__(
         self,
-        data_extractor: _ChEBIDataExtractor,
+        data_extractor: _ChEBIDataExtractor | LabeledUnlabeledMixed,
         base_loss: torch.nn.Module = None,
         tnorm: Literal["product", "lukasiewicz", "xu19"] = "product",
         impl_loss_weight=0.1,  # weight of implication loss in relation to base_loss
         pos_scalar=1,
         pos_epsilon=0.01,
+        multiply_by_softmax=False,
     ):
         super().__init__()
+        # automatically choose labeled subset for implication filter in case of mixed dataset
+        if isinstance(data_extractor, LabeledUnlabeledMixed):
+            data_extractor = data_extractor.labeled
         self.data_extractor = data_extractor
+        # propagate data_extractor to base loss
+        if isinstance(base_loss, BCEWeighted):
+            base_loss.data_extractor = self.data_extractor
         self.base_loss = base_loss
         self.implication_cache_file = f"implications_{self.data_extractor.name}.cache"
         self.label_names = _load_label_names(
@@ -36,6 +45,7 @@ def __init__(
         self.impl_weight = impl_loss_weight
         self.pos_scalar = pos_scalar
         self.eps = pos_epsilon
+        self.multiply_by_softmax = multiply_by_softmax
 
     def forward(self, input, target, **kwargs):
         nnl = kwargs.pop("non_null_labels", None)
@@ -70,16 +80,20 @@ def _calculate_implication_loss(self, l, r):
                 math.pow(1 + self.eps, 1 / self.pos_scalar)
                 - math.pow(self.eps, 1 / self.pos_scalar)
             )
-            r = torch.pow(r, self.pos_scalar)
+            one_min_r = torch.pow(1 - r, self.pos_scalar)
+        else:
+            one_min_r = 1 - r
         if self.tnorm == "product":
-            individual_loss = l * (1 - r)
+            individual_loss = l * one_min_r
         elif self.tnorm == "xu19":
-            individual_loss = -torch.log(1 - l * (1 - r))
+            individual_loss = -torch.log(1 - l * one_min_r)
         elif self.tnorm == "lukasiewicz":
-            individual_loss = torch.relu(l - r)
+            individual_loss = torch.relu(l + one_min_r - 1)
         else:
             raise NotImplementedError(f"Unknown tnorm {self.tnorm}")
 
+        if self.multiply_by_softmax:
+            individual_loss = individual_loss * individual_loss.softmax(dim=-1)
         return torch.mean(
             torch.sum(individual_loss, dim=-1),
             dim=0,
@@ -100,7 +114,7 @@ class DisjointLoss(ImplicationLoss):
     def __init__(
         self,
         path_to_disjointness,
-        data_extractor: _ChEBIDataExtractor,
+        data_extractor: _ChEBIDataExtractor | LabeledUnlabeledMixed,
         base_loss: torch.nn.Module = None,
         disjoint_loss_weight=100,
         **kwargs,
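A condensed view of the three t-norm variants after this change (a sketch for the defaults pos_scalar=1 and multiply_by_softmax=False, where l and r are the predicted probabilities of the antecedent and consequent of an implication l → r):

import torch

def implication_violation(l: torch.Tensor, r: torch.Tensor, tnorm: str = "product") -> torch.Tensor:
    one_min_r = 1 - r
    if tnorm == "product":
        return l * one_min_r                  # product t-norm
    if tnorm == "xu19":
        return -torch.log(1 - l * one_min_r)  # log-transformed product (Xu et al., 2019)
    if tnorm == "lukasiewicz":
        return torch.relu(l + one_min_r - 1)  # Łukasiewicz t-norm applied to (l, 1 - r)
    raise NotImplementedError(f"Unknown tnorm {tnorm}")

Note that for pos_scalar=1 the Łukasiewicz branch reduces to the previous torch.relu(l - r); the rewrite changes behaviour only when pos_scalar != 1, where one_min_r becomes (1 - r) ** pos_scalar.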

chebai/models/base.py

Lines changed: 0 additions & 2 deletions
@@ -28,11 +28,9 @@ class ChebaiBaseNet(LightningModule):
 
     Attributes:
         NAME (str): The name of the model.
-        LOSS (torch.nn.Module): The loss function used by the model.
     """
 
     NAME = None
-    LOSS = torch.nn.BCEWithLogitsLoss
 
     def __init__(
         self,

chebai/preprocessing/datasets/base.py

Lines changed: 4 additions & 0 deletions
@@ -313,6 +313,10 @@ def setup_processed(self):
     def processed_file_names(self):
         raise NotImplementedError
 
+    @property
+    def raw_file_names(self):
+        raise NotImplementedError
+
     @property
     def processed_file_names_dict(self) -> dict:
         raise NotImplementedError

chebai/preprocessing/datasets/chebi.py

Lines changed: 4 additions & 4 deletions
@@ -192,7 +192,7 @@ def graph_to_raw_dataset(self, g, split_name=None):
         return data
 
     def save_raw(self, data: pd.DataFrame, filename: str):
-        pickle.dump(data, open(os.path.join(self.raw_dir, filename), "wb"))
+        pd.to_pickle(data, open(os.path.join(self.raw_dir, filename), "wb"))
 
     def _load_dict(self, input_file_path):
         """
@@ -205,7 +205,7 @@ def _load_dict(self, input_file_path):
             dict: The dictionary, keys are `features`, `labels` and `ident`.
         """
         with open(input_file_path, "rb") as input_file:
-            df = pickle.load(input_file)
+            df = pd.read_pickle(input_file)
         if self.single_class is not None:
             single_cls_index = list(df.columns).index(int(self.single_class))
             for row in df.values:
@@ -218,7 +218,7 @@ def _load_dict(self, input_file_path):
     @staticmethod
     def _get_data_size(input_file_path):
         with open(input_file_path, "rb") as f:
-            return len(pickle.load(f))
+            return len(pd.read_pickle(f))
 
     def _setup_pruned_test_set(self):
         """Create test set with same leaf nodes, but use classes that appear in train set"""
@@ -468,7 +468,7 @@ def prepare_data(self, *args, **kwargs):
         with open(
             os.path.join(self.raw_dir, self.raw_file_names_dict["test"]), "rb"
         ) as input_file:
-            test_df = pickle.load(input_file)
+            test_df = pd.read_pickle(input_file)
         # create train/val split based on test set
         chebi_path = self._load_chebi(
             self.chebi_version_train
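As an aside, pd.read_pickle and pd.to_pickle also accept a path directly, so the explicit open(...) calls above could be dropped; a small sketch with a hypothetical path:

import pandas as pd

df = pd.read_pickle("data/raw/test.pkl")   # hypothetical path
pd.to_pickle(df, "data/raw/test_copy.pkl")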
