Separate methods for evaluation and prediction

aditya0by0 · aditya0by0 · commit 95d49c1f65e7 · 2025-06-01T16:29:40.000+02:00
diff --git a/chebai/ensemble/_base.py b/chebai/ensemble/_base.py
@@ -1,15 +1,21 @@
 from abc import ABC, abstractmethod
 from collections import deque
 from pathlib import Path
-from typing import Any, Deque, Dict, Optional
+from typing import Any, Deque, Dict, Literal, Optional
 
 import pandas as pd
 import torch
 from lightning import LightningModule
 
 from chebai.result.classification import print_metrics
 
-from ._constants import MODEL_CLS_PATH, MODEL_LBL_PATH, WRAPPER_CLS_PATH
+from ._constants import (
+    EVAL_OP,
+    MODEL_CLS_PATH,
+    MODEL_LBL_PATH,
+    PRED_OP,
+    WRAPPER_CLS_PATH,
+)
 
 
 class EnsembleBase(ABC):
@@ -22,38 +28,40 @@ class EnsembleBase(ABC):
     def __init__(
         self,
         model_configs: Dict[str, Dict[str, Any]],
-        data_file_path: str,
-        classes_file_path: str,
+        data_processed_dir_main: str,
+        operation: str = EVAL_OP,
         **kwargs: Any,
     ) -> None:
         """
         Initializes the ensemble model and loads configurations, labels, and sets up the environment.
 
         Args:
             model_configs (Dict[str, Dict[str, Any]]): Dictionary of model configurations.
-            data_file_path (str): Path to the processed data directory.
+            data_processed_dir_main (str): Path to the processed data directory.
             reader_dir_name (str): Name of the directory used by the reader. Defaults to 'smiles_token'.
             **kwargs (Any): Additional arguments, such as 'input_dim' and '_validate_configs'.
         """
-        if bool(kwargs.get("_validate_configs", True)):
-            self._validate_model_configs(model_configs)
+        if bool(kwargs.get("_perform_validation_checks", True)):
+            self._perform_validation_checks(
+                model_configs, operation=operation, **kwargs
+            )
 
         self._model_configs: Dict[str, Dict[str, Any]] = model_configs
-        self._data_file_path: str = data_file_path
-        self._classes_file_path: str = classes_file_path
+        self._data_processed_dir_main: str = data_processed_dir_main
+        self._operation: str = operation
+        print(f"Ensemble operation: {self._operation}")
+
         self._input_dim: Optional[int] = kwargs.get("input_dim", None)
         self._total_data_size: int = None
         self._ensemble_input: list[str] | Path = self._process_input_to_ensemble(
-            data_file_path
+            **kwargs
         )
         print(f"Total data size (data.pkl) is {self._total_data_size}")
 
         self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
         self._models: Dict[str, LightningModule] = {}
-        self._dm_labels: Dict[str, int] = self._load_data_module_labels(
-            classes_file_path
-        )
+        self._dm_labels: Dict[str, int] = self._load_data_module_labels()
         self._num_of_labels: int = len(self._dm_labels)
         print(f"Number of labes for this data is {self._num_of_labels} ")
 
@@ -63,7 +71,9 @@ def __init__(
         self._model_queue: Deque[str] = deque()
 
     @classmethod
-    def _validate_model_configs(cls, model_configs: Dict[str, Dict[str, Any]]) -> None:
+    def _perform_validation_checks(
+        cls, model_configs: Dict[str, Dict[str, Any]], operation, **kwargs
+    ) -> None:
         """
         Validates model configuration dictionary for required keys and uniqueness.
 
@@ -74,6 +84,19 @@ def _validate_model_configs(cls, model_configs: Dict[str, Dict[str, Any]]) -> No
             AttributeError: If any model config is missing required keys.
             ValueError: If duplicate paths are found for model checkpoint, class, or labels.
         """
+        if operation not in (EVAL_OP, PRED_OP):
+            raise ValueError(
+                f"Invalid operation '{operation}'. Must be '{EVAL_OP}' or '{PRED_OP}'."
+            )
+        smiles_path = kwargs.get("smiles_list_file_path")
+        if operation == PRED_OP and not smiles_path:
+            raise ValueError(
+                f"For '{PRED_OP}' operation, 'smiles_list_file_path' must be provided."
+            )
+        # The SMILES file is only read when predicting, so only check it then.
+        if operation == PRED_OP and not Path(smiles_path).exists():
+            raise FileNotFoundError(f"{smiles_path}")
+
         class_set, labels_set = set(), set()
         required_keys = {
             MODEL_CLS_PATH,
@@ -103,9 +126,9 @@ def _validate_model_configs(cls, model_configs: Dict[str, Dict[str, Any]]) -> No
             class_set.add(model_class_path)
             labels_set.add(model_labels_path)
 
-    def _process_input_to_ensemble(self, path: str):
-        p = Path(path)
-        if p.is_file():
+    def _process_input_to_ensemble(self, **kwargs: any) -> list[str] | Path:
+        if self._operation == PRED_OP:
+            p = Path(kwargs["smiles_list_file_path"])
             smiles_list = []
             with open(p, "r") as f:
                 for line in f:
@@ -116,24 +139,23 @@ def _process_input_to_ensemble(self, path: str):
                         smiles_list.append(smiles)
             self._total_data_size = len(smiles_list)
             return smiles_list
-        elif p.is_dir():
-            data_pkl_path = p / "data.pkl"
+        elif self._operation == EVAL_OP:
+            data_pkl_path = Path(self._data_processed_dir_main) / "data.pkl"
             if not data_pkl_path.exists():
                 raise FileNotFoundError()
             self._total_data_size = len(pd.read_pickle(data_pkl_path))
-            return p
+            return data_pkl_path.parent  # `p` is only bound in the PRED_OP branch
         else:
-            raise "Invalid path"
+            raise ValueError("Invalid operation")
 
-    @staticmethod
-    def _load_data_module_labels(classes_file_path: str) -> dict[str, int]:
+    def _load_data_module_labels(self) -> dict[str, int]:
         """
         Loads class labels from the classes.txt file and sets internal label mapping.
 
         Raises:
             FileNotFoundError: If the expected classes.txt file is not found.
         """
-        classes_file_path = Path(classes_file_path)
+        classes_file_path = Path(self._data_processed_dir_main) / "classes.txt"
         if not classes_file_path.exists():
             raise FileNotFoundError(f"{classes_file_path} does not exist")
         print(f"Loading {classes_file_path} ....")
@@ -197,14 +219,13 @@ def _controller(
         Returns:
             Dict[str, torch.Tensor]: Predictions or confidence scores.
         """
-        pass
 
     @abstractmethod
     def _consolidator(
         self,
+        *,
         pred_conf_dict: Dict[str, torch.Tensor],
         model_props: Dict[str, torch.Tensor],
-        *,
         true_scores: torch.Tensor,
         false_scores: torch.Tensor,
         **kwargs: Any,
@@ -214,7 +235,6 @@ def _consolidator(
 
         Should update the provided `true_scores` and `false_scores`.
         """
-        pass
 
     @abstractmethod
     def _consolidate_on_finish(
@@ -226,4 +246,3 @@ def _consolidate_on_finish(
         Returns:
             torch.Tensor: Final aggregated predictions.
         """
-        pass
diff --git a/chebai/ensemble/_constants.py b/chebai/ensemble/_constants.py
@@ -6,3 +6,7 @@
 
 READER_CLS_PATH = "reader_class_path"
 READER_KWARGS = "reader_kwargs"
+
+
+PRED_OP = "prediction"
+EVAL_OP = "evaluation"
diff --git a/chebai/ensemble/_controller.py b/chebai/ensemble/_controller.py
@@ -1,14 +1,12 @@
-from abc import ABC
+from abc import ABC, abstractmethod
 from collections import deque
 from typing import Any, Deque, Dict
 
 import torch
 from torch import Tensor
 
-from chebai.models import ChebaiBaseNet
-
 from ._base import EnsembleBase
-from ._constants import WRAPPER_CLS_PATH
+from ._constants import EVAL_OP, PRED_OP, WRAPPER_CLS_PATH
 from ._utils import _load_class
 from ._wrappers import BaseWrapper
 
@@ -33,6 +31,30 @@ def __init__(self, **kwargs: Any):
         super().__init__(**kwargs)
         self._kwargs = kwargs
 
+    @abstractmethod
+    def _controller(self, model_name, model_input, **kwargs: Any) -> Dict[str, Tensor]:
+        """
+        Wraps the named model, runs it on the input, and extracts predictions and confidences.
+
+        Args:
+            model_name: Name of the model, passed to `_wrap_model`.
+            model_input: Input given to the wrapper's `predict` (prediction) or `evaluate`.
+
+        Returns:
+            Dict with the keys "pred_conf_dict" and "model_props".
+        """
+        wrapped_model = self._wrap_model(model_name)
+        if self._operation == PRED_OP:
+            model_output, model_props = wrapped_model.predict(model_input)
+        else:
+            model_output, model_props = wrapped_model.evaluate(model_input)
+        del wrapped_model  # Model can be huge to keep it in memory, delete asap as no longer needed
+
+        pred_conf_dict = self._get_pred_conf_from_model_output(
+            model_output, model_props["mask"]
+        )
+        return {"pred_conf_dict": pred_conf_dict, "model_props": model_props}
+
     def _get_pred_conf_from_model_output(
         self, model_output: Dict[str, Tensor], model_label_mask: Tensor
     ) -> Dict[str, Tensor]:
@@ -100,10 +122,7 @@ def _controller(self, model_name, model_input, **kwargs: Any) -> Dict[str, Tenso
         Returns:
             Dict[str, Tensor]: Dictionary containing predictions and confidence scores.
         """
-        wrapped_model = self._wrap_model(model_name)
-        model_output, model_props = wrapped_model.predict(model_input)
-        del wrapped_model  # Model can be huge to keep it in memory, delete asap as no longer needed
-        pred_conf_dict = self._get_pred_conf_from_model_output(
-            model_output, model_props["mask"]
-        )
-        return {"pred_conf_dict": pred_conf_dict, "model_props": model_props}
+
+        output_dict = super()._controller(model_name, model_input, **kwargs)
+        # Some activation condition can be applied, not in this controller, so we return the output directly
+        return output_dict
diff --git a/chebai/ensemble/_wrappers/_base.py b/chebai/ensemble/_wrappers/_base.py
@@ -114,13 +114,22 @@ def name(self):
         return f"Wrapper({self.__class__.__name__}) for model: {self._model_name}"
 
     def predict(self, x: list) -> tuple[dict, dict]:
+        if not isinstance(x, list):
+            raise TypeError(f"Input must be a list of SMILES strings, got {type(x)}")
         return self._predict_from_list_of_smiles(x), self._model_props
 
     @abstractmethod
     def _predict_from_list_of_smiles(self, smiles_list: list) -> dict: ...
 
-    def evaluate(self, data_processed_dir_main: Path) -> tuple[dict, dict]:
-        return self._evaluate_from_data_file(data_processed_dir_main), self._model_props
+    def evaluate(
+        self, data_processed_dir_main: Path, **kwargs: any
+    ) -> tuple[dict, dict]:
+        if not data_processed_dir_main.is_dir():
+            raise NotADirectoryError(f"{data_processed_dir_main} is not a directory.")
+        return (
+            self._evaluate_from_data_file(data_processed_dir_main, **kwargs),
+            self._model_props,
+        )
 
     @abstractmethod
     def _evaluate_from_data_file(self, data_file_path: str) -> dict: ...
diff --git a/chebai/ensemble/_wrappers/_neural_network.py b/chebai/ensemble/_wrappers/_neural_network.py
@@ -1,4 +1,5 @@
 import os
+from pathlib import Path
 from typing import Type
 
 import torch
@@ -90,13 +91,18 @@ def _load_model_(self, input_dim: int | None) -> ChebaiBaseNet:
                 self._model_ckpt_path, input_dim=5
             )
         except Exception as e:
-            raise RuntimeError(f"Error loading model {self._model_name} \n Error: {e}")
+            raise RuntimeError(
+                f"Error loading model {self._model_name} \n Error: {e}"
+            ) from e
 
+        assert isinstance(
+            model, ChebaiBaseNet
+        ), f"{model} is not a ChebaiBaseNet instance."
         model.eval()
         model.freeze()
         return model
 
-    def _predict_from_list_of_smiles(self, smiles_list) -> list:
+    def _predict_from_list_of_smiles(self, smiles_list: list[str]) -> list:
         token_dicts = []
         could_not_parse = []
         index_map = dict()
@@ -131,16 +137,16 @@ def _read_smiles(self, smiles):
         return self._reader.to_data(dict(features=smiles, labels=None))
 
     def _forward_pass(self, batch):
-        processable_data = self._model._process_batch(
+        processable_data = self._model._process_batch(  # noqa
             self._collator(batch).to(self._device), 0
         )
         return self._model(processable_data, **processable_data["model_kwargs"])
 
-    def _predict_from_data_file(
-        self, processed_dir_main: str, data_file_name="data.pt"
+    def _evaluate_from_data_file(
+        self, data_processed_dir_main: Path, data_file_name="data.pt"
     ) -> list:
         data = torch.load(
-            os.path.join(processed_dir_main, self._reader.name(), data_file_name),
+            data_processed_dir_main / self._reader.name() / data_file_name,
             weights_only=False,
             map_location=self._device,
         )