ChEB-AI
diff --git a/‎.github/workflows/test.yml‎
Lines changed: 2 additions & 2 deletions b/‎.github/workflows/test.yml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎README.md‎
Lines changed: 15 additions & 2 deletions b/‎README.md‎
Lines changed: 15 additions & 2 deletions
diff --git a/‎chebai/loggers/custom.py‎
Lines changed: 2 additions & 1 deletion b/‎chebai/loggers/custom.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎chebai/loss/bce_weighted.py‎
Lines changed: 2 additions & 1 deletion b/‎chebai/loss/bce_weighted.py‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎chebai/loss/semantic.py‎
Lines changed: 7 additions & 3 deletions b/‎chebai/loss/semantic.py‎
Lines changed: 7 additions & 3 deletions
diff --git a/‎chebai/models/ffn.py‎
Lines changed: 1 addition & 2 deletions b/‎chebai/models/ffn.py‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎chebai/preprocessing/datasets/base.py‎
Lines changed: 50 additions & 15 deletions b/‎chebai/preprocessing/datasets/base.py‎
Lines changed: 50 additions & 15 deletions
@@ -9,7 +9,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: ["3.9", "3.10", "3.11", "3.12"]
+        python-version: ["3.10", "3.11", "3.12"]
 
     steps:
       - uses: actions/checkout@v4
@@ -24,7 +24,7 @@ jobs:
           python -m pip install --upgrade pip
           python -m pip install --upgrade pip setuptools wheel
           python -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
-          python -m pip install -e .
+          python -m pip install -e .[dev]
 
       - name: Display Python & Installed Packages
         run: |
 
@@ -78,8 +78,21 @@ The `classes_path` is the path to the dataset's `raw/classes.txt` file that cont
 
 ## Evaluation
 
-An example for evaluating a model trained on the ontology extension task is given in `tutorials/eval_model_basic.ipynb`.
-It takes in the finetuned model as input for performing the evaluation.
+You can evaluate a model trained on the ontology extension task in one of two ways:
+
+### 1. Using the Jupyter Notebook
+An example notebook is provided at `tutorials/eval_model_basic.ipynb`.  
+- Load your finetuned model and run the evaluation cells to compute metrics on the test set.
+
+### 2. Using the Lightning CLI
+Alternatively, you can evaluate the model via the CLI:
+
+```bash
+python -m chebai test --trainer=configs/training/default_trainer.yml --trainer.devices=1 --trainer.num_nodes=1 --ckpt_path=[path-to-finetuned-model] --model=configs/model/electra.yml --model.test_metrics=configs/metrics/micro-macro-f1.yml --data=configs/data/chebi/chebi50.yml --data.init_args.batch_size=32 --data.init_args.num_workers=10 --data.init_args.chebi_version=[chebi-version] --model.pass_loss_kwargs=false --model.criterion=configs/loss/bce.yml --model.criterion.init_args.beta=0.99 --data.init_args.splits_file_path=[path-to-splits-file]
+```
+
+> **Note**: It is recommended to use `devices=1` and `num_nodes=1` during testing. Multi-device settings use a `DistributedSampler`, which may replicate some samples to keep batch sizes equal across devices, so only a single-device run ensures that each sample is evaluated exactly once.
+
 
 ## Cross-validation
 You can do inner k-fold cross-validation, i.e., train models on k train-validation splits that all use the same test
 
@@ -2,7 +2,6 @@
 from datetime import datetime
 from typing import List, Literal, Optional, Union
 
-import wandb
 from lightning.fabric.utilities.types import _PATH
 from lightning.pytorch.callbacks import ModelCheckpoint
 from lightning.pytorch.loggers import WandbLogger
@@ -105,6 +104,8 @@ def set_fold(self, fold: int) -> None:
         Args:
             fold (int): Cross-validation fold number.
         """
+        import wandb
+
         if fold != self._fold:
             self._fold = fold
             # Start new experiment
 
@@ -5,7 +5,6 @@
 
 from chebai.preprocessing.datasets.base import XYBaseDataModule
 from chebai.preprocessing.datasets.chebi import _ChEBIDataExtractor
-from chebai.preprocessing.datasets.pubchem import LabeledUnlabeledMixed
 
 
 class BCEWeighted(torch.nn.BCEWithLogitsLoss):
@@ -27,6 +26,8 @@ def __init__(
         data_extractor: Optional[XYBaseDataModule] = None,
         **kwargs,
     ):
+        from chebai.preprocessing.datasets.pubchem import LabeledUnlabeledMixed
+
         self.beta = beta
         if isinstance(data_extractor, LabeledUnlabeledMixed):
             data_extractor = data_extractor.labeled
 
@@ -2,14 +2,16 @@
 import math
 import os
 import pickle
-from typing import List, Literal, Union
+from typing import TYPE_CHECKING, List, Literal, Union
 
 import torch
 
 from chebai.loss.bce_weighted import BCEWeighted
 from chebai.preprocessing.datasets.base import XYBaseDataModule
 from chebai.preprocessing.datasets.chebi import ChEBIOver100, _ChEBIDataExtractor
-from chebai.preprocessing.datasets.pubchem import LabeledUnlabeledMixed
+
+if TYPE_CHECKING:
+    from chebai.preprocessing.datasets.pubchem import LabeledUnlabeledMixed
 
 
 class ImplicationLoss(torch.nn.Module):
@@ -68,6 +70,8 @@ def __init__(
         multiply_with_base_loss: bool = True,
         no_grads: bool = False,
     ):
+        from chebai.preprocessing.datasets.pubchem import LabeledUnlabeledMixed
+
         super().__init__()
         # automatically choose labeled subset for implication filter in case of mixed dataset
         if isinstance(data_extractor, LabeledUnlabeledMixed):
@@ -338,7 +342,7 @@ class DisjointLoss(ImplicationLoss):
     def __init__(
         self,
         path_to_disjointness: str,
-        data_extractor: Union[_ChEBIDataExtractor, LabeledUnlabeledMixed],
+        data_extractor: Union[_ChEBIDataExtractor, "LabeledUnlabeledMixed"],
         base_loss: torch.nn.Module = None,
         disjoint_loss_weight: float = 100,
         **kwargs,
 
@@ -11,7 +11,6 @@ class FFN(ChebaiBaseNet):
 
     def __init__(
         self,
-        input_size: int,
         hidden_layers: List[int] = [
             1024,
         ],
@@ -20,7 +19,7 @@ def __init__(
         super().__init__(**kwargs)
 
         layers = []
-        current_layer_input_size = input_size
+        current_layer_input_size = self.input_dim
         for hidden_dim in hidden_layers:
             layers.append(MLPBlock(current_layer_input_size, hidden_dim))
             layers.append(Residual(MLPBlock(hidden_dim, hidden_dim)))
 
@@ -1,24 +1,22 @@
 import os
 import random
 from abc import ABC, abstractmethod
-from typing import Any, Dict, Generator, List, Optional, Tuple, Union
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Dict, Generator, List, Optional, Tuple, Union
 
 import lightning as pl
-import networkx as nx
 import pandas as pd
 import torch
 import tqdm
-from iterstrat.ml_stratifiers import (
-    MultilabelStratifiedKFold,
-    MultilabelStratifiedShuffleSplit,
-)
 from lightning.pytorch.core.datamodule import LightningDataModule
 from lightning_utilities.core.rank_zero import rank_zero_info
-from sklearn.model_selection import StratifiedShuffleSplit
 from torch.utils.data import DataLoader
 
 from chebai.preprocessing import reader as dr
 
+if TYPE_CHECKING:
+    import networkx as nx
+
 
 class XYBaseDataModule(LightningDataModule):
     """
@@ -419,10 +417,17 @@ def prepare_data(self, *args, **kwargs) -> None:
 
         self._prepare_data_flag += 1
         self._perform_data_preparation(*args, **kwargs)
+        self._after_prepare_data(*args, **kwargs)
 
     def _perform_data_preparation(self, *args, **kwargs) -> None:
         raise NotImplementedError
 
+    def _after_prepare_data(self, *args, **kwargs) -> None:
+        """
+        Post-processing hook run by `prepare_data` after `_perform_data_preparation`; no-op here.
+        """
+        ...
+
     def setup(self, *args, **kwargs) -> None:
         """
         Setup the data module.
@@ -464,14 +469,17 @@ def _set_processed_data_props(self):
             - self._num_of_labels: Number of target labels in the dataset.
             - self._feature_vector_size: Maximum feature vector length across all data points.
         """
-        data_pt = torch.load(
-            os.path.join(self.processed_dir, self.processed_file_names_dict["data"]),
-            weights_only=False,
+        pt_file_path = os.path.join(
+            self.processed_dir, self.processed_file_names_dict["data"]
         )
+        data_pt = torch.load(pt_file_path, weights_only=False)
 
         self._num_of_labels = len(data_pt[0]["labels"])
         self._feature_vector_size = max(len(d["features"]) for d in data_pt)
 
+        print(
+            f"Number of samples in encoded data ({pt_file_path}): {len(data_pt)} samples"
+        )
         print(f"Number of labels for loaded data: {self._num_of_labels}")
         print(f"Feature vector size: {self._feature_vector_size}")
 
@@ -734,6 +742,7 @@ def __init__(
         self.splits_file_path = self._validate_splits_file_path(
             kwargs.get("splits_file_path", None)
         )
+        self._data_pkl_filename: str = "data.pkl"
 
     @staticmethod
     def _validate_splits_file_path(splits_file_path: Optional[str]) -> Optional[str]:
@@ -818,7 +827,7 @@ def _download_required_data(self) -> str:
         pass
 
     @abstractmethod
-    def _extract_class_hierarchy(self, data_path: str) -> nx.DiGraph:
+    def _extract_class_hierarchy(self, data_path: str) -> "nx.DiGraph":
         """
         Extracts the class hierarchy from the data.
         Constructs a directed graph (DiGraph) using NetworkX, where nodes are annotated with fields/terms from
@@ -833,7 +842,7 @@ def _extract_class_hierarchy(self, data_path: str) -> nx.DiGraph:
         pass
 
     @abstractmethod
-    def _graph_to_raw_dataset(self, graph: nx.DiGraph) -> pd.DataFrame:
+    def _graph_to_raw_dataset(self, graph: "nx.DiGraph") -> pd.DataFrame:
         """
         Converts the graph to a raw dataset.
         Uses the graph created by `_extract_class_hierarchy` method to extract the
@@ -848,7 +857,7 @@ def _graph_to_raw_dataset(self, graph: nx.DiGraph) -> pd.DataFrame:
         pass
 
     @abstractmethod
-    def select_classes(self, g: nx.DiGraph, *args, **kwargs) -> List:
+    def select_classes(self, g: "nx.DiGraph", *args, **kwargs) -> List:
         """
         Selects classes from the dataset based on a specified criteria.
 
@@ -872,6 +881,21 @@ def save_processed(self, data: pd.DataFrame, filename: str) -> None:
         """
         pd.to_pickle(data, open(os.path.join(self.processed_dir_main, filename), "wb"))
 
+    def get_processed_pickled_df_file(self, filename: str) -> Optional[pd.DataFrame]:
+        """
+        Loads a pickled DataFrame from `processed_dir_main`, if the file exists.
+
+        Args:
+            filename (str): The filename for the pickle file.
+
+        Returns:
+            Optional[pd.DataFrame]: The loaded DataFrame, or None if the file does not exist.
+        """
+        file_path = Path(self.processed_dir_main) / filename
+        if file_path.exists():
+            return pd.read_pickle(file_path)
+        return None
+
     # ------------------------------ Phase: Setup data -----------------------------------
     def setup_processed(self) -> None:
         """
@@ -910,7 +934,9 @@ def _get_data_size(input_file_path: str) -> int:
             int: The size of the data.
         """
         with open(input_file_path, "rb") as f:
-            return len(pd.read_pickle(f))
+            df = pd.read_pickle(f)
+            print(f"Processed data size ({input_file_path}): {len(df)} rows")
+            return len(df)
 
     @abstractmethod
     def _load_dict(self, input_file_path: str) -> Generator[Dict[str, Any], None, None]:
@@ -1023,6 +1049,9 @@ def get_test_split(
         Raises:
             ValueError: If the DataFrame does not contain a column named "labels".
         """
+        from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
+        from sklearn.model_selection import StratifiedShuffleSplit
+
         print("Get test data split")
 
         labels_list = df["labels"].tolist()
@@ -1060,6 +1089,12 @@ def get_train_val_splits_given_test(
                 and validation DataFrames. The keys are the names of the train and validation sets, and the values
                 are the corresponding DataFrames.
         """
+        from iterstrat.ml_stratifiers import (
+            MultilabelStratifiedKFold,
+            MultilabelStratifiedShuffleSplit,
+        )
+        from sklearn.model_selection import StratifiedShuffleSplit
+
         print("Split dataset into train / val with given test set")
 
         test_ids = test_df["ident"].tolist()
@@ -1217,7 +1252,7 @@ def processed_main_file_names_dict(self) -> dict:
             dict: A dictionary mapping dataset key to their respective file names.
                   For example, {"data": "data.pkl"}.
         """
-        return {"data": "data.pkl"}
+        return {"data": self._data_pkl_filename}
 
     @property
     def raw_file_names(self) -> List[str]: