Merge branch 'dev' into feature/pyproject.toml

sfluegel05 · web-flow · commit 4f1f9950beb9 · 2025-07-10T12:47:46.000+02:00
diff --git a/chebai/preprocessing/datasets/base.py b/chebai/preprocessing/datasets/base.py
@@ -29,7 +29,8 @@ class XYBaseDataModule(LightningDataModule):
 
     Args:
         batch_size (int): The batch size for data loading. Default is 1.
-        train_split (float): The ratio of training data to total data and of test data to (validation + test) data. Default is 0.85.
+        test_split (float): The ratio of test data to total data. Default is 0.1.
+        validation_split (float): The ratio of validation data to total data. Default is 0.05.
         reader_kwargs (dict): Additional keyword arguments to be passed to the data reader. Default is None.
         prediction_kind (str): The kind of prediction to be performed (only relevant for the predict_dataloader). Default is "test".
         data_limit (Optional[int]): The maximum number of data samples to load. If set to None, the complete dataset will be used. Default is None.
@@ -45,7 +46,8 @@ class XYBaseDataModule(LightningDataModule):
     Attributes:
         READER (DataReader): The data reader class to use.
         reader (DataReader): An instance of the data reader class.
-        train_split (float): The ratio of training data to total data.
+        test_split (float): The ratio of test data to total data.
+        validation_split (float): The ratio of validation data to total data.
         batch_size (int): The batch size for data loading.
         prediction_kind (str): The kind of prediction to be performed.
         data_limit (Optional[int]): The maximum number of data samples to load.
@@ -68,7 +70,8 @@ class XYBaseDataModule(LightningDataModule):
     def __init__(
         self,
         batch_size: int = 1,
-        train_split: float = 0.85,
+        test_split: Optional[float] = 0.1,
+        validation_split: Optional[float] = 0.05,
         reader_kwargs: Optional[dict] = None,
         prediction_kind: str = "test",
         data_limit: Optional[int] = None,
@@ -86,7 +89,9 @@ def __init__(
         if reader_kwargs is None:
             reader_kwargs = dict()
         self.reader = self.READER(**reader_kwargs)
-        self.train_split = train_split
+        self.test_split = test_split
+        self.validation_split = validation_split
+
         self.batch_size = batch_size
         self.prediction_kind = prediction_kind
         self.data_limit = data_limit
@@ -1022,15 +1027,13 @@ def get_test_split(
 
         labels_list = df["labels"].tolist()
 
-        test_size = 1 - self.train_split - (1 - self.train_split) ** 2
-
         if len(labels_list[0]) > 1:
             splitter = MultilabelStratifiedShuffleSplit(
-                n_splits=1, test_size=test_size, random_state=seed
+                n_splits=1, test_size=self.test_split, random_state=seed
             )
         else:
             splitter = StratifiedShuffleSplit(
-                n_splits=1, test_size=test_size, random_state=seed
+                n_splits=1, test_size=self.test_split, random_state=seed
             )
 
         train_indices, test_indices = next(splitter.split(labels_list, labels_list))
@@ -1083,16 +1086,17 @@ def get_train_val_splits_given_test(
 
             return folds
 
-        # scale val set size by 1/self.train_split to compensate for (hypothetical) test set size (1-self.train_split)
-        test_size = ((1 - self.train_split) ** 2) / self.train_split
-
         if len(labels_list_trainval[0]) > 1:
             splitter = MultilabelStratifiedShuffleSplit(
-                n_splits=1, test_size=test_size, random_state=seed
+                n_splits=1,
+                test_size=self.validation_split / (1 - self.test_split),
+                random_state=seed,
             )
         else:
             splitter = StratifiedShuffleSplit(
-                n_splits=1, test_size=test_size, random_state=seed
+                n_splits=1,
+                test_size=self.validation_split / (1 - self.test_split),
+                random_state=seed,
             )
 
         train_indices, validation_indices = next(
diff --git a/chebai/result/analyse_sem.py b/chebai/result/analyse_sem.py
@@ -1,25 +1,21 @@
 import gc
-import os
-import sys
 import traceback
 from datetime import datetime
-from typing import List, LiteralString, Optional, Tuple
+from typing import List, LiteralString
 
-import torch
-import wandb
+import pandas as pd
 from torchmetrics.functional.classification import (
     multilabel_auroc,
     multilabel_average_precision,
     multilabel_f1_score,
 )
-from utils import evaluate_model, get_checkpoint_from_wandb, load_results_from_buffer
 
 from chebai.loss.semantic import DisjointLoss
 from chebai.models import Electra
 from chebai.preprocessing.datasets.base import _DynamicDataset
 from chebai.preprocessing.datasets.chebi import ChEBIOver100
-
-# from chebai.preprocessing.datasets.pubchem import PubChemKMeans
+from chebai.preprocessing.datasets.pubchem import PubChemKMeans
+from chebai.result.utils import *
 
 DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 
@@ -127,7 +123,7 @@ def load_preds_labels(
 def get_label_names(data_module):
     if os.path.exists(os.path.join(data_module.processed_dir_main, "classes.txt")):
         with open(os.path.join(data_module.processed_dir_main, "classes.txt")) as fin:
-            return [int(line.strip()) for line in fin]
+            return [line.strip() for line in fin]
     print(
         f"Failed to retrieve label names, {os.path.join(data_module.processed_dir_main, 'classes.txt')} not found"
     )
@@ -136,69 +132,97 @@ def get_label_names(data_module):
 
 def get_chebi_graph(data_module, label_names):
     if os.path.exists(os.path.join(data_module.raw_dir, "chebi.obo")):
-        chebi_graph = data_module.extract_class_hierarchy(
+        chebi_graph = data_module._extract_class_hierarchy(
             os.path.join(data_module.raw_dir, "chebi.obo")
         )
-        return chebi_graph.subgraph(label_names)
+        return chebi_graph.subgraph([int(n) for n in label_names])
     print(
         f"Failed to retrieve ChEBI graph, {os.path.join(data_module.raw_dir, 'chebi.obo')} not found"
     )
     return None
 
 
-def get_disjoint_groups():
-    disjoints_owl_file = os.path.join("data", "chebi-disjoints.owl")
-    with open(disjoints_owl_file, "r") as f:
-        plaintext = f.read()
-        segments = plaintext.split("<")
-        disjoint_pairs = []
-        left = None
-        for seg in segments:
-            if seg.startswith("rdf:Description ") or seg.startswith("owl:Class"):
-                left = int(seg.split('rdf:about="&obo;CHEBI_')[1].split('"')[0])
-            elif seg.startswith("owl:disjointWith"):
-                right = int(seg.split('rdf:resource="&obo;CHEBI_')[1].split('"')[0])
-                disjoint_pairs.append([left, right])
-
-        disjoint_groups = []
-        for seg in plaintext.split("<rdf:Description>"):
-            if "owl;AllDisjointClasses" in seg:
-                classes = seg.split('rdf:about="&obo;CHEBI_')[1:]
-                classes = [int(c.split('"')[0]) for c in classes]
-                disjoint_groups.append(classes)
+def get_disjoint_groups(disjoint_files):
+    """Read disjoint pairs (csv, owl) and disjoint groups (owl) from all given files."""
+    if disjoint_files is None:
+        disjoint_files = [os.path.join("data", "chebi-disjoints.owl")]
+    disjoint_pairs, disjoint_groups = [], []
+    for file in disjoint_files:
+        if file.split(".")[-1] == "csv":
+            disjoint_pairs += pd.read_csv(file, header=None).values.tolist()
+        elif file.split(".")[-1] == "owl":
+            with open(file, "r") as f:
+                plaintext = f.read()
+                segments = plaintext.split("<")
+                left = None
+                for seg in segments:
+                    if seg.startswith("rdf:Description ") or seg.startswith(
+                        "owl:Class"
+                    ):
+                        left = int(seg.split('rdf:about="&obo;CHEBI_')[1].split('"')[0])
+                    elif seg.startswith("owl:disjointWith"):
+                        right = int(
+                            seg.split('rdf:resource="&obo;CHEBI_')[1].split('"')[0]
+                        )
+                        disjoint_pairs.append([left, right])
+
+                # groups are appended to the shared list, so results of earlier files survive
+                for seg in plaintext.split("<rdf:Description>"):
+                    if "owl;AllDisjointClasses" in seg:
+                        classes = seg.split('rdf:about="&obo;CHEBI_')[1:]
+                        classes = [int(c.split('"')[0]) for c in classes]
+                        disjoint_groups.append(classes)
+        else:
+            raise NotImplementedError(
+                "Unsupported disjoint file format: " + file.split(".")[-1]
+            )
+
     disjoint_all = disjoint_pairs + disjoint_groups
     # one disjointness is commented out in the owl-file
     # (the correct way would be to parse the owl file and notice the comment symbols, but for this case, it should work)
-    disjoint_all.remove([22729, 51880])
-    print(f"Found {len(disjoint_all)} disjoint groups")
+    if [22729, 51880] in disjoint_all:
+        disjoint_all.remove([22729, 51880])
+    # print(f"Found {len(disjoint_all)} disjoint groups")
     return disjoint_all
 
 
 class PredictionSmoother:
     """Removes implication and disjointness violations from predictions"""
 
-    def __init__(self, dataset):
-        self.label_names = get_label_names(dataset)
+    def __init__(self, dataset, label_names=None, disjoint_files=None):
+        if label_names:
+            self.label_names = label_names
+        else:
+            self.label_names = get_label_names(dataset)
         self.chebi_graph = get_chebi_graph(dataset, self.label_names)
-        self.disjoint_groups = get_disjoint_groups()
+        self.disjoint_groups = get_disjoint_groups(disjoint_files)
 
     def __call__(self, preds):
         preds_sum_orig = torch.sum(preds)
-        print(f"Preds sum: {preds_sum_orig}")
-        # eliminate implication violations by setting each prediction to maximum of its successors
         for i, label in enumerate(self.label_names):
             succs = [
-                self.label_names.index(p) for p in self.chebi_graph.successors(label)
+                self.label_names.index(str(p))
+                for p in self.chebi_graph.successors(int(label))
             ] + [i]
             if len(succs) > 0:
+                if torch.any((torch.max(preds[:, succs], dim=1).values > 0.5) & (preds[:, i] < 0.5)):
+                    print(
+                        f"Correcting prediction for {label} to max of subclasses {list(self.chebi_graph.successors(int(label)))}"
+                    )
+                    print(
+                        f"Original pred: {preds[:, i]}, successors: {preds[:, succs]}"
+                    )
                 preds[:, i] = torch.max(preds[:, succs], dim=1).values
-        print(f"Preds change (step 1): {torch.sum(preds) - preds_sum_orig}")
+        if torch.sum(preds) != preds_sum_orig:
+            print(f"Preds change (step 1): {torch.sum(preds) - preds_sum_orig}")
         preds_sum_orig = torch.sum(preds)
         # step 2: eliminate disjointness violations: for group of disjoint classes, set all except max to 0.49 (if it is not already lower)
         preds_bounded = torch.min(preds, torch.ones_like(preds) * 0.49)
         for disj_group in self.disjoint_groups:
             disj_group = [
-                self.label_names.index(g) for g in disj_group if g in self.label_names
+                self.label_names.index(str(g))
+                for g in disj_group
+                if str(g) in self.label_names  # label names are strings, group ids are ints
             ]
             if len(disj_group) > 1:
                 old_preds = preds[:, disj_group]
@@ -215,14 +239,12 @@ def __call__(self, preds):
                     print(
                         f"disjointness group {[self.label_names[d] for d in disj_group]} changed {samples_changed} samples"
                     )
-        print(
-            f"Preds change after disjointness (step 2): {torch.sum(preds) - preds_sum_orig}"
-        )
         preds_sum_orig = torch.sum(preds)
         # step 3: disjointness violation removal may have caused new implication inconsistencies -> set each prediction to min of predecessors
         for i, label in enumerate(self.label_names):
             predecessors = [i] + [
-                self.label_names.index(p) for p in self.chebi_graph.predecessors(label)
+                self.label_names.index(str(p))
+                for p in self.chebi_graph.predecessors(int(label))
             ]
             lowest_predecessors = torch.min(preds[:, predecessors], dim=1)
             preds[:, i] = lowest_predecessors.values
diff --git a/chebai/result/generate_class_properties.py b/chebai/result/generate_class_properties.py
@@ -15,7 +15,8 @@
 
 class ClassesPropertiesGenerator:
     """
-    Computes PPV (Positive Predictive Value) and NPV (Negative Predictive Value)
+    Computes PPV (Positive Predictive Value) and NPV (Negative Predictive Value) and counts the number of
+    true positives (TP), false positives (FP), true negatives (TN), and false negatives (FN)
     for each class in a multi-label classification problem using a PyTorch Lightning model.
     """
 
@@ -35,23 +36,25 @@ def load_class_labels(path: Path) -> list[str]:
             return [line.strip() for line in f if line.strip()]
 
     @staticmethod
-    def compute_tpv_npv(
+    def compute_classwise_scores(
         y_true: list[torch.Tensor],
         y_pred: list[torch.Tensor],
+        raw_preds: torch.Tensor,
         class_names: list[str],
     ) -> dict[str, dict[str, float]]:
         """
-        Compute TPV (precision) and NPV for each class in a multi-label setting.
+        Compute PPV (precision, TP/(TP+FP)), NPV (TN/(TN+FN)) and the number of TNs, FPs, FNs and TPs for each class
+        in a multi-label setting.
 
         Args:
             y_true: List of binary ground-truth label tensors, one tensor per sample.
             y_pred: List of binary prediction tensors, one tensor per sample.
             class_names: Ordered list of class names corresponding to class indices.
 
         Returns:
-            Dictionary mapping each class name to its TPV and NPV metrics:
+            Dictionary mapping each class name to its PPV and NPV metrics:
             {
-                "class_name": {"PPV": float, "NPV": float},
+                "class_name": {"PPV": float, "NPV": float, "TN": int, "FP": int, "FN": int, "TP": int},
                 ...
             }
         """
@@ -67,13 +70,17 @@ def compute_tpv_npv(
             tn, fp, fn, tp = cm[idx].ravel()
             tpv = tp / (tp + fp) if (tp + fp) > 0 else 0.0
             npv = tn / (tn + fn) if (tn + fn) > 0 else 0.0
+            # positive_raw = [p.item() for i, p in enumerate(raw_preds[:, idx]) if true_np[i, idx]]
+            # negative_raw = [p.item() for i, p in enumerate(raw_preds[:, idx]) if not true_np[i, idx]]
             results[cls_name] = {
                 "PPV": round(tpv, 4),
                 "NPV": round(npv, 4),
                 "TN": int(tn),
                 "FP": int(fp),
                 "FN": int(fn),
                 "TP": int(tp),
+                # "positive_preds": positive_raw,
+                # "negative_preds": negative_raw,
             }
         return results
 
@@ -125,6 +132,7 @@ def generate_props(
         print("Running inference on validation data...")
 
         y_true, y_pred = [], []
+        raw_preds = []
         for batch_idx, batch in enumerate(val_loader):
             data = model._process_batch(  # pylint: disable=W0212
                 batch, batch_idx=batch_idx
@@ -135,20 +143,21 @@ def generate_props(
             preds = torch.sigmoid(logits) > 0.5
             y_pred.extend(preds)
             y_true.extend(labels)
-
-        print("Computing TPV and NPV metrics...")
+            raw_preds.extend(torch.sigmoid(logits))
+        raw_preds = torch.stack(raw_preds)
+        print("Computing metrics...")
         classes_file = Path(data_module.processed_dir_main) / "classes.txt"
         if output_path is None:
             output_file = Path(data_module.processed_dir_main) / "classes.json"
         else:
             output_file = Path(output_path)
 
         class_names = self.load_class_labels(classes_file)
-        metrics = self.compute_tpv_npv(y_true, y_pred, class_names)
+        metrics = self.compute_classwise_scores(y_true, y_pred, raw_preds, class_names)
 
         with output_file.open("w") as f:
             json.dump(metrics, f, indent=2)
-        print(f"Saved TPV/NPV metrics to {output_file}")
+        print(f"Saved metrics to {output_file}")
 
 
 class Main:
@@ -164,7 +173,7 @@ def generate(
         output_path: str | None = None,
     ) -> None:
         """
-        CLI command to generate TPV/NPV JSON.
+        CLI command to generate JSON with metrics on validation set.
 
         Args:
             model_ckpt_path: Path to the PyTorch Lightning checkpoint file.