
Commit ea28280

Merge branch 'dev' into fix/save_out_dim_to_checkpoint
2 parents: 1285e5a + 1a13718

File tree: 9 files changed (+373, -63 lines)


.gitignore

Lines changed: 1 addition & 0 deletions
```diff
@@ -175,3 +175,4 @@ chebai.egg-info
 lightning_logs
 logs
 .isort.cfg
+/.vscode
```

chebai/models/base.py

Lines changed: 4 additions & 3 deletions
```diff
@@ -1,9 +1,9 @@
 import logging
+from abc import ABC, abstractmethod
 from typing import Any, Dict, Iterable, Optional, Union

 import torch
 from lightning.pytorch.core.module import LightningModule
-from torchmetrics import Metric

 from chebai.preprocessing.structures import XYData

@@ -12,7 +12,7 @@
 _MODEL_REGISTRY = dict()


-class ChebaiBaseNet(LightningModule):
+class ChebaiBaseNet(LightningModule, ABC):
     """
     Base class for Chebai neural network models inheriting from PyTorch Lightning's LightningModule.

@@ -356,6 +356,7 @@ def _log_metrics(self, prefix: str, metrics: torch.nn.Module, batch_size: int):
             logger=True,
         )

+    @abstractmethod
     def forward(self, x: Dict[str, Any]) -> torch.Tensor:
         """
         Defines the forward pass.
@@ -366,7 +367,7 @@ def forward(self, x: Dict[str, Any]) -> torch.Tensor:
         Returns:
             torch.Tensor: The model output.
         """
-        raise NotImplementedError
+        pass

     def configure_optimizers(self, **kwargs) -> torch.optim.Optimizer:
         """
```

chebai/preprocessing/bin/graph_properties/tokens.txt

Whitespace-only changes.

chebai/preprocessing/datasets/base.py

Lines changed: 17 additions & 13 deletions
```diff
@@ -29,7 +29,8 @@ class XYBaseDataModule(LightningDataModule):

     Args:
         batch_size (int): The batch size for data loading. Default is 1.
-        train_split (float): The ratio of training data to total data and of test data to (validation + test) data. Default is 0.85.
+        test_split (float): The ratio of test data to total data. Default is 0.1.
+        validation_split (float): The ratio of validation data to total data. Default is 0.05.
         reader_kwargs (dict): Additional keyword arguments to be passed to the data reader. Default is None.
         prediction_kind (str): The kind of prediction to be performed (only relevant for the predict_dataloader). Default is "test".
         data_limit (Optional[int]): The maximum number of data samples to load. If set to None, the complete dataset will be used. Default is None.
@@ -45,7 +46,8 @@ class XYBaseDataModule(LightningDataModule):
     Attributes:
         READER (DataReader): The data reader class to use.
         reader (DataReader): An instance of the data reader class.
-        train_split (float): The ratio of training data to total data.
+        test_split (float): The ratio of test data to total data.
+        validation_split (float): The ratio of validation data to total data.
         batch_size (int): The batch size for data loading.
         prediction_kind (str): The kind of prediction to be performed.
         data_limit (Optional[int]): The maximum number of data samples to load.
@@ -68,7 +70,8 @@ class XYBaseDataModule(LightningDataModule):
     def __init__(
         self,
         batch_size: int = 1,
-        train_split: float = 0.85,
+        test_split: Optional[float] = 0.1,
+        validation_split: Optional[float] = 0.05,
         reader_kwargs: Optional[dict] = None,
         prediction_kind: str = "test",
         data_limit: Optional[int] = None,
@@ -86,7 +89,9 @@ def __init__(
         if reader_kwargs is None:
             reader_kwargs = dict()
         self.reader = self.READER(**reader_kwargs)
-        self.train_split = train_split
+        self.test_split = test_split
+        self.validation_split = validation_split
+
         self.batch_size = batch_size
         self.prediction_kind = prediction_kind
         self.data_limit = data_limit
@@ -1022,15 +1027,13 @@ def get_test_split(

         labels_list = df["labels"].tolist()

-        test_size = 1 - self.train_split - (1 - self.train_split) ** 2
-
         if len(labels_list[0]) > 1:
             splitter = MultilabelStratifiedShuffleSplit(
-                n_splits=1, test_size=test_size, random_state=seed
+                n_splits=1, test_size=self.test_split, random_state=seed
             )
         else:
             splitter = StratifiedShuffleSplit(
-                n_splits=1, test_size=test_size, random_state=seed
+                n_splits=1, test_size=self.test_split, random_state=seed
             )

         train_indices, test_indices = next(splitter.split(labels_list, labels_list))
@@ -1083,16 +1086,17 @@ def get_train_val_splits_given_test(

             return folds

-        # scale val set size by 1/self.train_split to compensate for (hypothetical) test set size (1-self.train_split)
-        test_size = ((1 - self.train_split) ** 2) / self.train_split
-
         if len(labels_list_trainval[0]) > 1:
             splitter = MultilabelStratifiedShuffleSplit(
-                n_splits=1, test_size=test_size, random_state=seed
+                n_splits=1,
+                test_size=self.validation_split / (1 - self.test_split),
+                random_state=seed,
             )
         else:
             splitter = StratifiedShuffleSplit(
-                n_splits=1, test_size=test_size, random_state=seed
+                n_splits=1,
+                test_size=self.validation_split / (1 - self.test_split),
+                random_state=seed,
             )

         train_indices, validation_indices = next(
```
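A quick sanity check of the new split arithmetic (plain Python, using the defaults from this commit): the train/validation splitter only sees the data left over after the test split, so `validation_split` is rescaled by `1 / (1 - test_split)` to remain a fraction of the full dataset.

```python
# Defaults introduced by this commit.
test_split = 0.1         # 10% of the full dataset -> test
validation_split = 0.05  # 5% of the full dataset -> validation

# The second splitter runs on the remaining (1 - test_split) of the data,
# so the fraction handed to it must be rescaled:
val_fraction_of_remainder = validation_split / (1 - test_split)  # ~0.0556

# Check: the rescaled fraction of the remainder equals 5% of the full dataset,
# reproducing the old 85/5/10 split without the train_split algebra.
assert abs(val_fraction_of_remainder * (1 - test_split) - validation_split) < 1e-12
train_fraction = 1 - test_split - validation_split  # 0.85
```

This makes all three fractions explicit instead of deriving the test and validation sizes from a single `train_split` value.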

chebai/result/analyse_sem.py

Lines changed: 66 additions & 40 deletions
```diff
@@ -1,20 +1,21 @@
 import gc
-import sys
 import traceback
 from datetime import datetime
 from typing import List, LiteralString

+import pandas as pd
 from torchmetrics.functional.classification import (
     multilabel_auroc,
     multilabel_average_precision,
     multilabel_f1_score,
 )
-from utils import *

 from chebai.loss.semantic import DisjointLoss
+from chebai.models import Electra
 from chebai.preprocessing.datasets.base import _DynamicDataset
 from chebai.preprocessing.datasets.chebi import ChEBIOver100
 from chebai.preprocessing.datasets.pubchem import PubChemKMeans
+from chebai.result.utils import *

 DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

@@ -122,7 +123,7 @@ def load_preds_labels(
 def get_label_names(data_module):
     if os.path.exists(os.path.join(data_module.processed_dir_main, "classes.txt")):
         with open(os.path.join(data_module.processed_dir_main, "classes.txt")) as fin:
-            return [int(line.strip()) for line in fin]
+            return [line.strip() for line in fin]
     print(
         f"Failed to retrieve label names, {os.path.join(data_module.processed_dir_main, 'classes.txt')} not found"
     )
@@ -131,70 +132,97 @@ def get_label_names(data_module):

 def get_chebi_graph(data_module, label_names):
     if os.path.exists(os.path.join(data_module.raw_dir, "chebi.obo")):
-        chebi_graph = data_module.extract_class_hierarchy(
+        chebi_graph = data_module._extract_class_hierarchy(
             os.path.join(data_module.raw_dir, "chebi.obo")
         )
-        return chebi_graph.subgraph(label_names)
+        return chebi_graph.subgraph([int(n) for n in label_names])
     print(
         f"Failed to retrieve ChEBI graph, {os.path.join(data_module.raw_dir, 'chebi.obo')} not found"
     )
     return None


-def get_disjoint_groups():
-    disjoints_owl_file = os.path.join("data", "chebi-disjoints.owl")
-    with open(disjoints_owl_file, "r") as f:
-        plaintext = f.read()
-        segments = plaintext.split("<")
-        disjoint_pairs = []
-        left = None
-        for seg in segments:
-            if seg.startswith("rdf:Description ") or seg.startswith("owl:Class"):
-                left = int(seg.split('rdf:about="&obo;CHEBI_')[1].split('"')[0])
-            elif seg.startswith("owl:disjointWith"):
-                right = int(seg.split('rdf:resource="&obo;CHEBI_')[1].split('"')[0])
-                disjoint_pairs.append([left, right])
-
-        disjoint_groups = []
-        for seg in plaintext.split("<rdf:Description>"):
-            if "owl;AllDisjointClasses" in seg:
-                classes = seg.split('rdf:about="&obo;CHEBI_')[1:]
-                classes = [int(c.split('"')[0]) for c in classes]
-                disjoint_groups.append(classes)
+def get_disjoint_groups(disjoint_files):
+    if disjoint_files is None:
+        disjoint_files = os.path.join("data", "chebi-disjoints.owl")
+    disjoint_pairs, disjoint_groups = [], []
+    for file in disjoint_files:
+        if file.split(".")[-1] == "csv":
+            disjoint_pairs += pd.read_csv(file, header=None).values.tolist()
+        elif file.split(".")[-1] == "owl":
+            with open(file, "r") as f:
+                plaintext = f.read()
+            segments = plaintext.split("<")
+            disjoint_pairs = []
+            left = None
+            for seg in segments:
+                if seg.startswith("rdf:Description ") or seg.startswith(
+                    "owl:Class"
+                ):
+                    left = int(seg.split('rdf:about="&obo;CHEBI_')[1].split('"')[0])
+                elif seg.startswith("owl:disjointWith"):
+                    right = int(
+                        seg.split('rdf:resource="&obo;CHEBI_')[1].split('"')[0]
+                    )
+                    disjoint_pairs.append([left, right])
+
+            disjoint_groups = []
+            for seg in plaintext.split("<rdf:Description>"):
+                if "owl;AllDisjointClasses" in seg:
+                    classes = seg.split('rdf:about="&obo;CHEBI_')[1:]
+                    classes = [int(c.split('"')[0]) for c in classes]
+                    disjoint_groups.append(classes)
+        else:
+            raise NotImplementedError(
+                "Unsupported disjoint file format: " + file.split(".")[-1]
+            )
+
     disjoint_all = disjoint_pairs + disjoint_groups
     # one disjointness is commented out in the owl-file
     # (the correct way would be to parse the owl file and notice the comment symbols, but for this case, it should work)
-    disjoint_all.remove([22729, 51880])
-    print(f"Found {len(disjoint_all)} disjoint groups")
+    if [22729, 51880] in disjoint_all:
+        disjoint_all.remove([22729, 51880])
+    # print(f"Found {len(disjoint_all)} disjoint groups")
     return disjoint_all


 class PredictionSmoother:
     """Removes implication and disjointness violations from predictions"""

-    def __init__(self, dataset):
-        self.label_names = get_label_names(dataset)
+    def __init__(self, dataset, label_names=None, disjoint_files=None):
+        if label_names:
+            self.label_names = label_names
+        else:
+            self.label_names = get_label_names(dataset)
         self.chebi_graph = get_chebi_graph(dataset, self.label_names)
-        self.disjoint_groups = get_disjoint_groups()
+        self.disjoint_groups = get_disjoint_groups(disjoint_files)

     def __call__(self, preds):
-
         preds_sum_orig = torch.sum(preds)
-        print(f"Preds sum: {preds_sum_orig}")
-        # eliminate implication violations by setting each prediction to maximum of its successors
         for i, label in enumerate(self.label_names):
             succs = [
-                self.label_names.index(p) for p in self.chebi_graph.successors(label)
+                self.label_names.index(str(p))
+                for p in self.chebi_graph.successors(int(label))
             ] + [i]
             if len(succs) > 0:
+                if torch.max(preds[:, succs], dim=1).values > 0.5 and preds[:, i] < 0.5:
+                    print(
+                        f"Correcting prediction for {label} to max of subclasses {list(self.chebi_graph.successors(int(label)))}"
+                    )
+                    print(
+                        f"Original pred: {preds[:, i]}, successors: {preds[:, succs]}"
+                    )
                 preds[:, i] = torch.max(preds[:, succs], dim=1).values
-        print(f"Preds change (step 1): {torch.sum(preds) - preds_sum_orig}")
+        if torch.sum(preds) != preds_sum_orig:
+            print(f"Preds change (step 1): {torch.sum(preds) - preds_sum_orig}")
         preds_sum_orig = torch.sum(preds)
         # step 2: eliminate disjointness violations: for group of disjoint classes, set all except max to 0.49 (if it is not already lower)
         preds_bounded = torch.min(preds, torch.ones_like(preds) * 0.49)
         for disj_group in self.disjoint_groups:
             disj_group = [
-                self.label_names.index(g) for g in disj_group if g in self.label_names
+                self.label_names.index(str(g))
+                for g in disj_group
+                if g in self.label_names
             ]
             if len(disj_group) > 1:
                 old_preds = preds[:, disj_group]
@@ -211,14 +239,12 @@ def __call__(self, preds):
                 print(
                     f"disjointness group {[self.label_names[d] for d in disj_group]} changed {samples_changed} samples"
                 )
-        print(
-            f"Preds change after disjointness (step 2): {torch.sum(preds) - preds_sum_orig}"
-        )
         preds_sum_orig = torch.sum(preds)
         # step 3: disjointness violation removal may have caused new implication inconsistencies -> set each prediction to min of predecessors
         for i, label in enumerate(self.label_names):
             predecessors = [i] + [
-                self.label_names.index(p) for p in self.chebi_graph.predecessors(label)
+                self.label_names.index(str(p))
+                for p in self.chebi_graph.predecessors(int(label))
             ]
             lowest_predecessors = torch.min(preds[:, predecessors], dim=1)
             preds[:, i] = lowest_predecessors.values
```
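For reference, a hypothetical use of the refactored interface. The paths, the label count, and the no-argument construction of the data module are illustrative assumptions; `disjoint_files` is now expected to be an iterable of `.owl`/`.csv` paths rather than a single hard-coded file:

```python
import torch

from chebai.preprocessing.datasets.chebi import ChEBIOver100
from chebai.result.analyse_sem import PredictionSmoother

# Paths are illustrative; any mix of .owl and .csv disjointness sources works.
smoother = PredictionSmoother(
    dataset=ChEBIOver100(),  # assumes default construction works in your setup
    disjoint_files=["data/chebi-disjoints.owl", "data/extra-disjoints.csv"],
)

preds = torch.rand(1, 854)  # (n_samples, n_labels); 854 is illustrative
smoother(preds)  # preds is adjusted in place (steps 1-3 in the diff above)
```

Note the direction of the type change: label names are now kept as the raw strings from `classes.txt` and cast to `int` only at the ChEBI-graph boundary, which avoids the `str`/`int` mismatches between label lists and graph nodes that the old code was prone to.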
