Store collated labels of any model in an instance variable

aditya0by0 · aditya0by0 · commit a20ce7626efd · 2025-06-01T20:45:14.000+02:00
diff --git a/chebai/ensemble/_base.py b/chebai/ensemble/_base.py
@@ -1,7 +1,7 @@
 from abc import ABC, abstractmethod
 from collections import deque
 from pathlib import Path
-from typing import Any, Deque, Dict, Literal, Optional
+from typing import Any, Deque, Dict, Optional
 
 import pandas as pd
 import torch
@@ -38,7 +38,6 @@ def __init__(
         Args:
             model_configs (Dict[str, Dict[str, Any]]): Dictionary of model configurations.
             data_processed_dir_main (str): Path to the processed data directory.
-            reader_dir_name (str): Name of the directory used by the reader. Defaults to 'smiles_token'.
             **kwargs (Any): Additional arguments, such as 'input_dim' and '_perform_validation_checks'.
         """
         if bool(kwargs.get("_perform_validation_checks", True)):
@@ -51,16 +50,15 @@ def __init__(
         self._operation: str = operation
         print(f"Ensemble operation: {self._operation}")
 
-        self._input_dim: Optional[int] = kwargs.get("input_dim", None)
-        self._total_data_size: int = None
+        # This instance variable is set inside the method `_process_input_to_ensemble`
+        self._total_data_size: int | None = None
         self._ensemble_input: list[str] | Path = self._process_input_to_ensemble(
             **kwargs
         )
         print(f"Total data size (data.pkl) is {self._total_data_size}")
 
         self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
-        self._models: Dict[str, LightningModule] = {}
         self._dm_labels: Dict[str, int] = self._load_data_module_labels()
         self._num_of_labels: int = len(self._dm_labels)
         print(f"Number of labes for this data is {self._num_of_labels} ")
@@ -69,6 +67,7 @@ def __init__(
             1, self._num_of_labels, device=self._device
         )
         self._model_queue: Deque[str] = deque()
+        self._collated_labels: torch.Tensor | None = None
 
     @classmethod
     def _perform_validation_checks(
@@ -126,10 +125,10 @@ def _perform_validation_checks(
             class_set.add(model_class_path)
             labels_set.add(model_labels_path)
 
-    def _process_input_to_ensemble(self, **kwargs: any) -> list[str] | Path:
+    def _process_input_to_ensemble(self, **kwargs: Any) -> list[str] | Path:
         if self._operation == PRED_OP:
             p = Path(kwargs["smiles_list_file_path"])
-            smiles_list = []
+            smiles_list: list[str] = []
             with open(p, "r") as f:
                 for line in f:
                     # Skip empty or whitespace-only lines
@@ -140,11 +139,14 @@ def _process_input_to_ensemble(self, **kwargs: any) -> list[str] | Path:
             self._total_data_size = len(smiles_list)
             return smiles_list
         elif self._operation == EVAL_OP:
-            data_pkl_path = Path(self._data_processed_dir_main) / "data.pkl"
+            processed_dir_path = Path(self._data_processed_dir_main)
+            data_pkl_path = processed_dir_path / "data.pkl"
             if not data_pkl_path.exists():
-                raise FileNotFoundError()
+                raise FileNotFoundError(
+                    f"data.pkl does not exist in the {processed_dir_path} directory"
+                )
             self._total_data_size = len(pd.read_pickle(data_pkl_path))
-            return p
+            return processed_dir_path
         else:
             raise ValueError("Invalid operation")
 
@@ -180,6 +182,9 @@ def run_ensemble(self) -> None:
             self._total_data_size, self._num_of_labels, device=self._device
         )
 
+        print(
+            f"Running {self.__class__.__name__} ensemble for {self._operation} operation..."
+        )
         while self._model_queue:
             model_name = self._model_queue.popleft()
             print(f"Processing model: {model_name}")
@@ -195,16 +200,17 @@ def run_ensemble(self) -> None:
                 false_scores=false_scores,
             )
 
-        print(f"Consolidating predictions for {self.__class__.__name__}")
         final_preds = self._consolidate_on_finish(
             true_scores=true_scores, false_scores=false_scores
         )
-        print_metrics(
-            final_preds,
-            self._collated_data.y,
-            self._device,
-            classes=list(self._dm_labels.keys()),
-        )
+
+        if self._operation == EVAL_OP:
+            print_metrics(
+                final_preds,
+                self._collated_labels,
+                self._device,
+                classes=list(self._dm_labels.keys()),
+            )
 
     @abstractmethod
     def _controller(
diff --git a/chebai/ensemble/_controller.py b/chebai/ensemble/_controller.py
@@ -1,13 +1,14 @@
-from abc import ABC, abstractmethod
+from abc import ABC
 from collections import deque
+from pathlib import Path
 from typing import Any, Deque, Dict
 
 import torch
 from torch import Tensor
 
 from ._base import EnsembleBase
 from ._constants import EVAL_OP, PRED_OP, WRAPPER_CLS_PATH
-from ._utils import _load_class
+from ._utils import load_class
 from ._wrappers import BaseWrapper
 
 
@@ -30,9 +31,16 @@ def __init__(self, **kwargs: Any):
         """
         super().__init__(**kwargs)
         self._kwargs = kwargs
+        # When the model corresponding to an activation condition is added to the queue, it is
+        # removed from this set, in order to avoid re-adding models that have already been processed
+        self._model_key_set: set[str] = set(self._model_configs.keys())
 
-    @abstractmethod
-    def _controller(self, model_name, model_input, **kwargs: Any) -> Dict[str, Tensor]:
+        # Labels from any processed data.pt file for any reader
+        self._collated_labels: torch.Tensor | None = None
+
+    def _controller(
+        self, model_name: str, model_input: list[str] | Path, **kwargs: Any
+    ) -> Dict[str, Tensor]:
         """
         Performs inference with the model and extracts predictions and confidence values.
 
@@ -82,14 +90,17 @@ def _get_pred_conf_from_model_output(
 
     def _wrap_model(self, model_name: str) -> BaseWrapper:
         model_config = self._model_configs[model_name]
-        wrp_cls = _load_class(model_config[WRAPPER_CLS_PATH])
+        wrp_cls = load_class(model_config[WRAPPER_CLS_PATH])
         assert issubclass(wrp_cls, BaseWrapper), ""
         wrapped_model = wrp_cls(
             model_name=model_name,
             model_config=model_config,
             dm_labels=self._dm_labels,
             **self._kwargs
         )
+        if self._collated_labels is not None and self._operation == EVAL_OP:
+            self._collated_labels = wrapped_model.collated_labels
+
         assert isinstance(wrapped_model, BaseWrapper), ""
         return wrapped_model
 
@@ -110,19 +121,3 @@ def __init__(self, **kwargs: Any):
         """
         super().__init__(**kwargs)
         self._model_queue: Deque[str] = deque(list(self._model_configs.keys()))
-
-    def _controller(self, model_name, model_input, **kwargs: Any) -> Dict[str, Tensor]:
-        """
-        Performs inference with the model and extracts predictions and confidence values.
-
-        Args:
-            model (ChebaiBaseNet): The model to perform inference with.
-            model_props (Dict[str, Tensor]): Dictionary with label mask and trust scores.
-
-        Returns:
-            Dict[str, Tensor]: Dictionary containing predictions and confidence scores.
-        """
-
-        output_dict = super()._controller(model_name, model_input, **kwargs)
-        # Some activation condition can be applied, not in this controller, so we return the output directly
-        return output_dict
diff --git a/chebai/ensemble/_utils.py b/chebai/ensemble/_utils.py
@@ -1,7 +1,7 @@
 import importlib
 
 
-def _load_class(class_path):
+def load_class(class_path: str) -> type:
     module_path, class_name = class_path.rsplit(".", 1)
     module = importlib.import_module(module_path)
     return getattr(module, class_name)
diff --git a/chebai/ensemble/_wrappers/_base.py b/chebai/ensemble/_wrappers/_base.py
@@ -22,6 +22,7 @@ def __init__(
         self._model_class_path = self._model_config[MODEL_CLS_PATH]
         self._model_labels_path = self._model_config[MODEL_LBL_PATH]
         self._model_props = self._generate_model_label_props(dm_labels=dm_labels)
+        self.collated_labels = None
 
     def _generate_model_label_props(self, dm_labels) -> dict[str, torch.Tensor]:
         """
diff --git a/chebai/ensemble/_wrappers/_neural_network.py b/chebai/ensemble/_wrappers/_neural_network.py
@@ -7,9 +7,10 @@
 
 from chebai.models import ChebaiBaseNet
 from chebai.preprocessing.reader import DataReader
+from chebai.preprocessing.structures import XYData
 
 from .._constants import MODEL_CKPT_PATH, READER_CLS_PATH, READER_KWARGS
-from .._utils import _load_class
+from .._utils import load_class
 from ._base import BaseWrapper
 
 
@@ -29,10 +30,11 @@ def __init__(self, **kwargs):
             else dict()
         )
 
-        reader_cls: Type[DataReader] = _load_class(self._reader_class_path)
+        reader_cls: Type[DataReader] = load_class(self._reader_class_path)
         assert issubclass(reader_cls, DataReader), ""
         self._reader = reader_cls(**self._reader_kwargs)
         self._collator = reader_cls.COLLATOR()
+        self.collated_labels = None
         self._model: ChebaiBaseNet = self._load_model_(
             input_dim=kwargs.get("input_dim", None)
         )
@@ -80,7 +82,7 @@ def _load_model_(self, input_dim: int | None) -> ChebaiBaseNet:
                 f"Model path '{self._model_ckpt_path}' for '{self._model_name}' does not exist."
             )
 
-        lightning_cls = _load_class(self._model_class_path)
+        lightning_cls = load_class(self._model_class_path)
 
         assert isinstance(lightning_cls, type), f"{lightning_cls} is not a class."
         assert issubclass(
@@ -137,9 +139,9 @@ def _read_smiles(self, smiles):
         return self._reader.to_data(dict(features=smiles, labels=None))
 
     def _forward_pass(self, batch):
-        processable_data = self._model._process_batch(  # noqa
-            self._collator(batch).to(self._device), 0
-        )
+        collated_batch: XYData = self._collator(batch).to(self._device)
+        self.collated_labels = collated_batch.y
+        processable_data = self._model._process_batch(collated_batch, 0)  # noqa
         return self._model(processable_data, **processable_data["model_kwargs"])
 
     def _evaluate_from_data_file(