 import os
 import random
 from abc import ABC, abstractmethod
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Dict, Generator, List, Optional, Tuple, Union

 import lightning as pl
@@ -76,6 +77,7 @@ def __init__(
         label_filter: Optional[int] = None,
         balance_after_filter: Optional[float] = None,
         num_workers: int = 1,
+        persistent_workers: bool = True,
         chebi_version: int = 200,
         inner_k_folds: int = -1,  # use inner cross-validation if > 1
         fold_index: Optional[int] = None,
@@ -99,6 +101,7 @@ def __init__(
         ), "Filter balancing requires a filter"
         self.balance_after_filter = balance_after_filter
         self.num_workers = num_workers
+        self.persistent_workers: bool = bool(persistent_workers)
         self.chebi_version = chebi_version
         assert type(inner_k_folds) is int
         self.inner_k_folds = inner_k_folds
@@ -363,7 +366,7 @@ def train_dataloader(self, *args, **kwargs) -> Union[DataLoader, List[DataLoader]]
             "train",
             shuffle=True,
             num_workers=self.num_workers,
-            persistent_workers=True,
+            persistent_workers=self.persistent_workers,
             **kwargs,
         )

@@ -382,7 +385,7 @@ def val_dataloader(self, *args, **kwargs) -> Union[DataLoader, List[DataLoader]]
             "validation",
             shuffle=False,
             num_workers=self.num_workers,
-            persistent_workers=True,
+            persistent_workers=self.persistent_workers,
             **kwargs,
         )

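The new flag is handed straight to torch.utils.data.DataLoader. A minimal,
self-contained sketch of what it controls (toy dataset, names are illustrative):

import torch
from torch.utils.data import DataLoader, TensorDataset

dataset = TensorDataset(torch.arange(10).float())

# persistent_workers=True keeps worker processes alive across epochs and
# avoids re-forking them; False (the DataLoader default) tears them down
# after each epoch, which frees memory between epochs.
loader = DataLoader(dataset, batch_size=2, num_workers=1, persistent_workers=True)

for epoch in range(2):
    for (batch,) in loader:
        pass  # the same workers serve the second epoch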
@@ -420,10 +423,17 @@ def prepare_data(self, *args, **kwargs) -> None:

         self._prepare_data_flag += 1
         self._perform_data_preparation(*args, **kwargs)
+        self._after_prepare_data(*args, **kwargs)

     def _perform_data_preparation(self, *args, **kwargs) -> None:
         raise NotImplementedError

+    def _after_prepare_data(self, *args, **kwargs) -> None:
+        """
+        Hook for additional processing once the pre-processed data is available.
+        """
+        ...
+
     def setup(self, *args, **kwargs) -> None:
         """
         Setup the data module.
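A sketch of how a subclass might use the new hook; BaseDataModule below is a
simplified stand-in for the real base class, reduced to the calls relevant here:

class BaseDataModule:
    def prepare_data(self, *args, **kwargs) -> None:
        self._perform_data_preparation(*args, **kwargs)
        self._after_prepare_data(*args, **kwargs)

    def _perform_data_preparation(self, *args, **kwargs) -> None:
        raise NotImplementedError

    def _after_prepare_data(self, *args, **kwargs) -> None:
        ...  # default: do nothing

class MyDataModule(BaseDataModule):
    def _perform_data_preparation(self, *args, **kwargs) -> None:
        print("writing pre-processed files")

    def _after_prepare_data(self, *args, **kwargs) -> None:
        # runs once the pre-processed files exist on disk
        print("running extra post-processing")

MyDataModule().prepare_data()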
@@ -466,14 +476,17 @@ def _set_processed_data_props(self):
         - self._num_of_labels: Number of target labels in the dataset.
         - self._feature_vector_size: Maximum feature vector length across all data points.
         """
-        data_pt = torch.load(
-            os.path.join(self.processed_dir, self.processed_file_names_dict["data"]),
-            weights_only=False,
+        pt_file_path = os.path.join(
+            self.processed_dir, self.processed_file_names_dict["data"]
         )
+        data_pt = torch.load(pt_file_path, weights_only=False)

         self._num_of_labels = len(data_pt[0]["labels"])
         self._feature_vector_size = max(len(d["features"]) for d in data_pt)

+        print(
+            f"Number of samples in encoded data ({pt_file_path}): {len(data_pt)} samples"
+        )
         print(f"Number of labels for loaded data: {self._num_of_labels}")
         print(f"Feature vector size: {self._feature_vector_size}")

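_set_processed_data_props only assumes the loaded object is a sequence of dicts
with "labels" and "features" entries; a toy illustration of the derived values:

# Toy stand-in for what torch.load returns from the processed .pt file.
data_pt = [
    {"features": [4, 8, 15], "labels": [True, False]},
    {"features": [16, 23, 42, 7], "labels": [False, True]},
]

num_of_labels = len(data_pt[0]["labels"])                       # -> 2
feature_vector_size = max(len(d["features"]) for d in data_pt)  # -> 4
print(f"Number of samples in encoded data: {len(data_pt)} samples")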
@@ -747,6 +760,7 @@ def __init__(
         )
         self.apply_label_filter = apply_label_filter
         self.apply_id_filter = apply_id_filter
+        self._data_pkl_filename: str = "data.pkl"

     @staticmethod
     def _validate_splits_file_path(splits_file_path: Optional[str]) -> Optional[str]:
@@ -885,6 +899,21 @@ def save_processed(self, data: pd.DataFrame, filename: str) -> None:
         """
         pd.to_pickle(data, open(os.path.join(self.processed_dir_main, filename), "wb"))

+    def get_processed_pickled_df_file(self, filename: str) -> Optional[pd.DataFrame]:
+        """
+        Loads the processed dataset pickle file, if it exists.
+
+        Args:
+            filename (str): The filename of the pickle file.
+
+        Returns:
+            Optional[pd.DataFrame]: The dataset, or None if the file does not exist.
+        """
+        file_path = Path(self.processed_dir_main) / filename
+        if file_path.exists():
+            return pd.read_pickle(file_path)
+        return None
+
     # ------------------------------ Phase: Setup data -----------------------------------
     def setup_processed(self) -> None:
         """
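A standalone rendering of the new accessor's contract, for illustration (the
directory and filename below are placeholders):

import pandas as pd
from pathlib import Path
from typing import Optional

def get_processed_pickled_df(processed_dir_main: str, filename: str) -> Optional[pd.DataFrame]:
    # mirrors the method above: return the pickled DataFrame, or None if absent
    file_path = Path(processed_dir_main) / filename
    if file_path.exists():
        return pd.read_pickle(file_path)
    return None

df = get_processed_pickled_df("/tmp", "data.pkl")
print(df.shape if df is not None else "no processed pickle yet")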
@@ -923,7 +952,9 @@ def _get_data_size(input_file_path: str) -> int:
             int: The size of the data.
         """
         with open(input_file_path, "rb") as f:
-            return len(pd.read_pickle(f))
+            df = pd.read_pickle(f)
+            print(f"Processed data size ({input_file_path}): {len(df)} rows")
+            return len(df)

     @abstractmethod
     def _load_dict(self, input_file_path: str) -> Generator[Dict[str, Any], None, None]:
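For reference, a self-contained demonstration of what the reworked helper now
reports (the temporary path is illustrative):

import pandas as pd

path = "/tmp/data_size_demo.pkl"
pd.to_pickle(pd.DataFrame({"features": [[1], [2, 3]], "labels": [[0], [1]]}), path)

with open(path, "rb") as f:
    df = pd.read_pickle(f)
    print(f"Processed data size ({path}): {len(df)} rows")  # -> 2 rows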
@@ -1260,7 +1291,7 @@ def processed_main_file_names_dict(self) -> dict:
             dict: A dictionary mapping dataset keys to their respective file names.
             For example, {"data": "data.pkl"}.
         """
-        return {"data": "data.pkl"}
+        return {"data": self._data_pkl_filename}

     @property
     def raw_file_names(self) -> List[str]:
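Because the property now reads self._data_pkl_filename, a subclass can point
the pipeline at a differently named pickle without overriding the property.
The class names below are hypothetical:

class ExtractorBase:  # stand-in for the extractor class touched in this diff
    def __init__(self):
        self._data_pkl_filename: str = "data.pkl"

    @property
    def processed_main_file_names_dict(self) -> dict:
        return {"data": self._data_pkl_filename}

class AugmentedExtractor(ExtractorBase):
    def __init__(self):
        super().__init__()
        self._data_pkl_filename = "data_augmented.pkl"

print(AugmentedExtractor().processed_main_file_names_dict)  # {'data': 'data_augmented.pkl'}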