predict method implementation for data file and list of smiles

aditya0by0 · aditya0by0 · commit 76d8a79c8e62 · 2025-06-01T15:30:56.000+02:00
diff --git a/chebai/ensemble/_base.py b/chebai/ensemble/_base.py
@@ -1,26 +1,15 @@
-import importlib
-import json
-import os
 from abc import ABC, abstractmethod
 from collections import deque
-from typing import Any, Deque, Dict, Optional, Tuple
+from pathlib import Path
+from typing import Any, Deque, Dict, Optional
 
+import pandas as pd
 import torch
 from lightning import LightningModule
 
-from chebai.models import ChebaiBaseNet
-from chebai.preprocessing.structures import XYData
 from chebai.result.classification import print_metrics
 
-from ._constants import (
-    MODEL_CKPT_PATH,
-    MODEL_CLS_PATH,
-    MODEL_LBL_PATH,
-    READER_CLS_PATH,
-    WRAPPER_CLS_PATH,
-)
-from ._utils import _load_class
-from ._wrappers import BaseWrapper
+from ._constants import MODEL_CLS_PATH, MODEL_LBL_PATH, WRAPPER_CLS_PATH
 
 
 class EnsembleBase(ABC):
@@ -33,38 +22,45 @@ class EnsembleBase(ABC):
     def __init__(
         self,
         model_configs: Dict[str, Dict[str, Any]],
-        data_processed_dir_main: str,
+        data_file_path: str,
+        classes_file_path: str,
         **kwargs: Any,
     ) -> None:
         """
         Initializes the ensemble model and loads configurations, labels, and sets up the environment.
 
         Args:
             model_configs (Dict[str, Dict[str, Any]]): Dictionary of model configurations.
-            data_processed_dir_main (str): Path to the processed data directory.
-            reader_dir_name (str): Name of the directory used by the reader. Defaults to 'smiles_token'.
+            data_file_path (str): Path to a SMILES file (one per line) or a directory holding data.pkl.
+            classes_file_path (str): Path to the classes.txt file listing one label per line.
             **kwargs (Any): Additional arguments, such as 'input_dim' and '_validate_configs'.
         """
         if bool(kwargs.get("_validate_configs", True)):
             self._validate_model_configs(model_configs)
 
         self._model_configs: Dict[str, Dict[str, Any]] = model_configs
-        self._data_processed_dir_main: str = data_processed_dir_main
+        self._data_file_path: str = data_file_path
+        self._classes_file_path: str = classes_file_path
         self._input_dim: Optional[int] = kwargs.get("input_dim", None)
-        self._total_data_size: int = len(self._collated_data)
+        self._total_data_size: Optional[int] = None  # set by _process_input_to_ensemble
+        self._ensemble_input: list[str] | Path = self._process_input_to_ensemble(
+            data_file_path
+        )
+        print(f"Total data size is {self._total_data_size}")
 
         self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
         self._models: Dict[str, LightningModule] = {}
-        self._dm_labels: Dict[str, int] = self._load_data_module_labels()
+        self._dm_labels: Dict[str, int] = self._load_data_module_labels(
+            classes_file_path
+        )
         self._num_of_labels: int = len(self._dm_labels)
+        print(f"Number of labels for this data is {self._num_of_labels}")
 
         self._num_models_per_label: torch.Tensor = torch.zeros(
             1, self._num_of_labels, device=self._device
         )
         self._model_queue: Deque[str] = deque()
-        self._collated_data: Optional[XYData] = None
-        self._total_data_size: Optional[int] = None
 
     @classmethod
     def _validate_model_configs(cls, model_configs: Dict[str, Dict[str, Any]]) -> None:
@@ -107,21 +103,43 @@ def _validate_model_configs(cls, model_configs: Dict[str, Dict[str, Any]]) -> No
             class_set.add(model_class_path)
             labels_set.add(model_labels_path)
 
-    def _load_data_module_labels(self) -> dict[str, int]:
+    def _process_input_to_ensemble(self, path: str) -> list[str] | Path:
+        """
+        Resolves the ensemble input: a SMILES file is read into a list, a directory is returned.
+
+        Sets ``self._total_data_size`` as a side effect. Raises FileNotFoundError if a
+        directory has no ``data.pkl`` and ValueError if ``path`` is neither file nor directory.
+        """
+        p = Path(path)
+        if p.is_file():
+            with open(p, "r") as f:
+                # First whitespace-separated token of every non-blank line is the SMILES
+                smiles_list = [line.split()[0] for line in f if line.strip()]
+            self._total_data_size = len(smiles_list)
+            return smiles_list
+        if p.is_dir():
+            data_pkl_path = p / "data.pkl"
+            if not data_pkl_path.exists():
+                raise FileNotFoundError(f"{data_pkl_path} does not exist")
+            self._total_data_size = len(pd.read_pickle(data_pkl_path))
+            return p
+        raise ValueError(f"Invalid path '{path}': expected an existing file or directory")
+
+    @staticmethod
+    def _load_data_module_labels(classes_file_path: str) -> dict[str, int]:
         """
         Loads class labels from the classes.txt file and sets internal label mapping.
 
         Raises:
             FileNotFoundError: If the expected classes.txt file is not found.
         """
-        classes_txt_file = os.path.join(self._data_processed_dir_main, "classes.txt")
-        print(f"Loading {classes_txt_file} ....")
-
-        if not os.path.exists(classes_txt_file):
-            raise FileNotFoundError(f"{classes_txt_file} does not exist")
+        classes_file_path = Path(classes_file_path)
+        if not classes_file_path.exists():
+            raise FileNotFoundError(f"{classes_file_path} does not exist")
+        print(f"Loading {classes_file_path} ....")
 
         dm_labels_dict = {}
-        with open(classes_txt_file, "r") as f:
+        with open(classes_file_path, "r") as f:
             for line in f:
                 label = line.strip()
                 if label not in dm_labels_dict:
@@ -132,6 +150,7 @@ def run_ensemble(self) -> None:
         """
         Executes the full ensemble prediction pipeline, aggregating predictions and printing metrics.
         """
+        assert self._total_data_size is not None and self._num_of_labels is not None
         true_scores = torch.zeros(
             self._total_data_size, self._num_of_labels, device=self._device
         )
@@ -144,12 +163,12 @@ def run_ensemble(self) -> None:
             print(f"Processing model: {model_name}")
 
             print("\t Passing model to controller to generate predictions...")
-            pred_conf_dict, model_props = self._controller(model_name)
+            controller_output = self._controller(model_name, self._ensemble_input)
 
             print("\t Passing predictions to consolidator for aggregation...")
             self._consolidator(
-                pred_conf_dict,
-                model_props,
+                pred_conf_dict=controller_output["pred_conf_dict"],
+                model_props=controller_output["model_props"],
                 true_scores=true_scores,
                 false_scores=false_scores,
             )
@@ -168,8 +187,8 @@ def run_ensemble(self) -> None:
     @abstractmethod
     def _controller(
         self,
-        model: LightningModule,
-        model_props: Dict[str, torch.Tensor],
+        model_name: str,
+        model_input: list[str] | Path,
         **kwargs: Any,
     ) -> Dict[str, torch.Tensor]:
         """
diff --git a/chebai/ensemble/_controller.py b/chebai/ensemble/_controller.py
@@ -1,14 +1,11 @@
-import os.path
 from abc import ABC
 from collections import deque
 from typing import Any, Deque, Dict
 
 import torch
-from lightning import LightningModule
 from torch import Tensor
 
 from chebai.models import ChebaiBaseNet
-from chebai.preprocessing.collate import RaggedCollator
 
 from ._base import EnsembleBase
 from ._constants import WRAPPER_CLS_PATH
@@ -72,7 +69,6 @@ def _wrap_model(self, model_name: str) -> BaseWrapper:
             **self._kwargs
         )
         assert isinstance(wrapped_model, BaseWrapper), ""
-        # del wrapped_model  # Model can be huge to keep it in memory, delete as no longer needed
         return wrapped_model
 
 
@@ -93,7 +89,7 @@ def __init__(self, **kwargs: Any):
         super().__init__(**kwargs)
         self._model_queue: Deque[str] = deque(list(self._model_configs.keys()))
 
-    def _controller(self, model_name, **kwargs: Any) -> Dict[str, Tensor]:
+    def _controller(self, model_name, model_input, **kwargs: Any) -> Dict[str, Tensor]:
         """
         Performs inference with the model and extracts predictions and confidence values.
 
@@ -105,4 +101,13 @@ def _controller(self, model_name, **kwargs: Any) -> Dict[str, Tensor]:
             Dict[str, Tensor]: Dictionary containing predictions and confidence scores.
         """
         wrapped_model = self._wrap_model(model_name)
-        return self._get_pred_conf_from_model_output(model_output, model_props["mask"])
+        # A list holds raw SMILES strings for predict(); any other input is the
+        # processed data directory (a Path), which only evaluate() accepts.
+        if isinstance(model_input, list):
+            model_output, model_props = wrapped_model.predict(model_input)
+        else:
+            model_output, model_props = wrapped_model.evaluate(model_input)
+        del wrapped_model  # Model can be huge to keep it in memory, delete asap as no longer needed
+        pred_conf_dict = self._get_pred_conf_from_model_output(
+            model_output, model_props["mask"]
+        )
+        return {"pred_conf_dict": pred_conf_dict, "model_props": model_props}
diff --git a/chebai/ensemble/_wrappers/_base.py b/chebai/ensemble/_wrappers/_base.py
@@ -1,8 +1,7 @@
-import importlib
 import json
 import os
 from abc import ABC, abstractmethod
-from typing import overload
+from pathlib import Path
 
 import torch
 
@@ -22,10 +21,9 @@ def __init__(
         self._model_name = model_name
         self._model_class_path = self._model_config[MODEL_CLS_PATH]
         self._model_labels_path = self._model_config[MODEL_LBL_PATH]
-        self._dm_labels: dict[str, int] = dm_labels
-        self._model_props = self._generate_model_label_props()
+        self._model_props = self._generate_model_label_props(dm_labels=dm_labels)
 
-    def _generate_model_label_props(self) -> dict[str, torch.Tensor]:
+    def _generate_model_label_props(self, dm_labels) -> dict[str, torch.Tensor]:
         """
         Generates label mask and confidence tensors (TPV, FPV) for a model.
 
@@ -38,13 +36,15 @@ def _generate_model_label_props(self) -> dict[str, torch.Tensor]:
         model_label_indices, tpv_label_values, fpv_label_values = [], [], []
 
         for label, props in labels_dict.items():
-            if label in self._dm_labels:
+            if label in dm_labels:
                 try:
                     self._validate_model_labels_json_element(labels_dict[label])
                 except Exception as e:
-                    raise Exception(f"Label '{label}' has an unexpected error") from e
+                    raise Exception(
+                        f"Label '{label}' has an unexpected error \n Error: {e}"
+                    )
 
-                model_label_indices.append(self._dm_labels[label])
+                model_label_indices.append(dm_labels[label])
                 tpv_label_values.append(props["TPV"])
                 fpv_label_values.append(props["FPV"])
 
@@ -54,7 +54,7 @@ def _generate_model_label_props(self) -> dict[str, torch.Tensor]:
             )
 
         # Create masks to apply predictions only to known classes
-        mask = torch.zeros(len(self._dm_labels), dtype=torch.bool, device=self._device)
+        mask = torch.zeros(len(dm_labels), dtype=torch.bool, device=self._device)
         mask[torch.tensor(model_label_indices, device=self._device)] = True
 
         tpv_tensor = torch.full_like(mask, -1, dtype=torch.float, device=self._device)
@@ -113,26 +113,14 @@ def _validate_model_labels_json_element(label_dict: dict[str, float]) -> None:
     def name(self):
         return f"Wrapper({self.__class__.__name__}) for model: {self._model_name}"
 
-    @overload
-    def predict(self, smiles_list: list) -> tuple[dict, dict]:
-        pass
-
-    @overload
-    def predict(self, data_file_path: str) -> tuple[dict, dict]:
-        pass
-
-    def predict(self, x: list | str) -> tuple[dict, dict]:
-        if isinstance(x, list):
-            return self._predict_from_list_of_smiles(x), self._model_props
-        elif isinstance(x, str):
-            return self._predict_from_data_file(x), self._model_props
-        else:
-            raise TypeError(f"Type {type(x)} is not supported.")
+    def predict(self, x: list) -> tuple[dict, dict]:
+        return self._predict_from_list_of_smiles(x), self._model_props
 
     @abstractmethod
-    def _predict_from_list_of_smiles(self, smiles_list: list) -> dict:
-        pass
+    def _predict_from_list_of_smiles(self, smiles_list: list) -> dict: ...
+
+    def evaluate(self, data_processed_dir_main: Path) -> tuple[dict, dict]:
+        return self._evaluate_from_data_file(data_processed_dir_main), self._model_props
 
     @abstractmethod
-    def _predict_from_data_file(self, data_file_path: str) -> dict:
-        pass
+    def _evaluate_from_data_file(self, data_file_path: str) -> dict: ...
diff --git a/chebai/ensemble/_wrappers/_neural_network.py b/chebai/ensemble/_wrappers/_neural_network.py
@@ -1,5 +1,5 @@
 import os
-from typing import Optional, Type
+from typing import Type
 
 import torch
 from rdkit import Chem
@@ -15,7 +15,9 @@
 class NNWrapper(BaseWrapper):
 
     def __init__(self, **kwargs):
-        self._validate_model_configs(**kwargs)
+        self._validate_model_configs(
+            model_config=kwargs["model_config"], model_name=kwargs["model_name"]
+        )
         super().__init__(**kwargs)
 
         self._model_ckpt_path = self._model_config[MODEL_CKPT_PATH]
@@ -30,11 +32,15 @@ def __init__(self, **kwargs):
         assert issubclass(reader_cls, DataReader), ""
         self._reader = reader_cls(**self._reader_kwargs)
         self._collator = reader_cls.COLLATOR()
-        self._model: ChebaiBaseNet = self._load_model_()
+        self._model: ChebaiBaseNet = self._load_model_(
+            input_dim=kwargs.get("input_dim", None)
+        )
 
     @classmethod
     def _validate_model_configs(
-        cls, model_config: dict[str, str], model_name: str
+        cls,
+        model_config: dict[str, str],
+        model_name: str,
     ) -> None:
         """
         Validates model configuration dictionary for required keys and uniqueness.
@@ -57,12 +63,12 @@ def _validate_model_configs(
                 f"Missing keys {missing_keys} in model '{model_name}' configuration."
             )
 
-    def _load_model_(self) -> ChebaiBaseNet:
+    def _load_model_(self, input_dim: int | None) -> ChebaiBaseNet:
         """
         Loads a model checkpoint and its label-related properties.
 
         Args:
-            model_name (str): Name of the model to load.
+            input_dim (int | None): Input dimension forwarded to ``load_from_checkpoint``.
 
         Returns:
-            Tuple[LightningModule, Dict[str, torch.Tensor]]: The model and its label properties.
+            ChebaiBaseNet: The loaded model, switched to eval mode and frozen.
@@ -73,22 +79,21 @@ def _load_model_(self) -> ChebaiBaseNet:
                 f"Model path '{self._model_ckpt_path}' for '{self._model_name}' does not exist."
             )
 
-        lightning_cls = self._load_class(self._model_class_path)
+        lightning_cls = _load_class(self._model_class_path)
 
         assert isinstance(lightning_cls, type), f"{lightning_cls} is not a class."
         assert issubclass(
             lightning_cls, ChebaiBaseNet
         ), f"{lightning_cls} must inherit from ChebaiBaseNet"
-
         try:
             model = lightning_cls.load_from_checkpoint(
-                self._model_ckpt_path, input_dim=self.input_dim
+                self._model_ckpt_path, input_dim=input_dim
             )
-            model.eval()
-            model.freeze()
         except Exception as e:
-            raise RuntimeError(f"Error loading model {self._model_name}") from e
+            raise RuntimeError(f"Error loading model {self._model_name} \n Error: {e}") from e
 
+        model.eval()
+        model.freeze()
         return model
 
     def _predict_from_list_of_smiles(self, smiles_list) -> list: