Save predictions to CSV for predict operation mode

aditya0by0 · aditya0by0 · commit 93e9b7398060 · 2025-06-02T16:28:13.000+02:00
diff --git a/chebai/ensemble/__init__.py b/chebai/ensemble/__init__.py
@@ -1,4 +1,3 @@
-from ._base import EnsembleBase
 from ._consolidator import WeightedMajorityVoting
 from ._controller import NoActivationCondition
 from ._wrappers import NNWrapper
diff --git a/chebai/ensemble/_base.py b/chebai/ensemble/_base.py
@@ -1,11 +1,10 @@
 from abc import ABC, abstractmethod
 from collections import deque
 from pathlib import Path
-from typing import Any, Deque, Dict, Optional
+from typing import Any, Deque, Dict
 
 import pandas as pd
 import torch
-from lightning import LightningModule
 
 from chebai.result.classification import print_metrics
 
@@ -29,7 +28,7 @@ def __init__(
         self,
         model_configs: Dict[str, Dict[str, Any]],
         data_processed_dir_main: str,
-        operation: str = EVAL_OP,
+        operation_mode: str = EVAL_OP,
         **kwargs: Any,
     ) -> None:
         """
@@ -42,13 +41,13 @@ def __init__(
         """
         if bool(kwargs.get("_perform_validation_checks", True)):
             self._perform_validation_checks(
-                model_configs, operation=operation, **kwargs
+                model_configs, operation=operation_mode, **kwargs
             )
 
         self._model_configs: Dict[str, Dict[str, Any]] = model_configs
         self._data_processed_dir_main: str = data_processed_dir_main
-        self._operation: str = operation
-        print(f"Ensemble operation: {self._operation}")
+        self._operation_mode: str = operation_mode
+        print(f"Ensemble operation: {self._operation_mode}")
 
         # These instance variables will be set in method `_process_input_to_ensemble`
         self._total_data_size: int | None = None
@@ -126,7 +125,7 @@ def _perform_validation_checks(
             labels_set.add(model_labels_path)
 
     def _process_input_to_ensemble(self, **kwargs: Any) -> list[str] | Path:
-        if self._operation == PRED_OP:
+        if self._operation_mode == PRED_OP:
             p = Path(kwargs["smiles_list_file_path"])
             smiles_list: list[str] = []
             with open(p, "r") as f:
@@ -138,7 +137,7 @@ def _process_input_to_ensemble(self, **kwargs: Any) -> list[str] | Path:
                         smiles_list.append(smiles)
             self._total_data_size = len(smiles_list)
             return smiles_list
-        elif self._operation == EVAL_OP:
+        elif self._operation_mode == EVAL_OP:
             processed_dir_path = Path(self._data_processed_dir_main)
             data_pkl_path = processed_dir_path / "data.pkl"
             if not data_pkl_path.exists():
@@ -183,7 +182,7 @@ def run_ensemble(self) -> None:
         )
 
         print(
-            f"Running {self.__class__.__name__} ensemble for {self._operation} operation..."
+            f"Running {self.__class__.__name__} ensemble for {self._operation_mode} operation..."
         )
         while self._model_queue:
             model_name = self._model_queue.popleft()
@@ -204,7 +203,7 @@ def run_ensemble(self) -> None:
             true_scores=true_scores, false_scores=false_scores
         )
 
-        if self._operation == EVAL_OP:
+        if self._operation_mode == EVAL_OP:
             assert (
                 self._collated_labels is not None
             ), "Collated labels must be set for evaluation operation."
@@ -214,6 +213,31 @@ def run_ensemble(self) -> None:
                 self._device,
                 classes=list(self._dm_labels.keys()),
             )
+        else:
+            # Get SMILES and label names
+            smiles_list = self._ensemble_input
+            label_names = list(self._dm_labels.keys())
+            # Efficient conversion from tensor to NumPy
+            preds_np = final_preds.detach().cpu().numpy()
+
+            assert (
+                len(smiles_list) == preds_np.shape[0]
+            ), "Length of SMILES list does not match number of predictions."
+            assert (
+                len(label_names) == preds_np.shape[1]
+            ), "Number of label names does not match number of predictions."
+
+            # Build DataFrame
+            df = pd.DataFrame(preds_np, columns=label_names)
+            df.insert(0, "SMILES", smiles_list)
+
+            # Save to CSV
+            output_path = (
+                Path(self._data_processed_dir_main) / "ensemble_predictions.csv"
+            )
+            df.to_csv(output_path, index=False)
+
+            print(f"Predictions saved to {output_path}")
 
     @abstractmethod
     def _controller(
diff --git a/chebai/ensemble/_controller.py b/chebai/ensemble/_controller.py
@@ -52,7 +52,7 @@ def _controller(
             Dict[str, Tensor]: Dictionary containing predictions and confidence scores.
         """
         wrapped_model = self._wrap_model(model_name)
-        if self._operation == PRED_OP:
+        if self._operation_mode == PRED_OP:
             model_output, model_props = wrapped_model.predict(model_input)
         else:
             model_output, model_props = wrapped_model.evaluate(model_input)
diff --git a/chebai/ensemble/_wrappers/_base.py b/chebai/ensemble/_wrappers/_base.py
@@ -43,7 +43,7 @@ def _generate_model_label_props(self, dm_labels) -> dict[str, torch.Tensor]:
                 except Exception as e:
                     raise Exception(
                         f"Label '{label}' has an unexpected error \n Error: {e}"
-                    )
+                    ) from e
 
                 model_label_indices.append(dm_labels[label])
                 tpv_label_values.append(props["TPV"])
diff --git a/chebai/ensemble/_wrappers/_neural_network.py b/chebai/ensemble/_wrappers/_neural_network.py
@@ -126,7 +126,6 @@ def _predict_from_list_of_smiles(self, smiles_list: list[str]) -> list:
                 else:
                     index_map[i] = len(token_dicts)
                     token_dicts.append(d)
-        print(f"Predicting {len(token_dicts), token_dicts} out of {len(smiles_list)}")
         if token_dicts:
             model_output = self._forward_pass(token_dicts)
             if not isinstance(model_output, dict) and not "logits" in model_output:

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,3 @@`
`1`		`-from ._base import EnsembleBase`
`2`	`1`	`from ._consolidator import WeightedMajorityVoting`
`3`	`2`	`from ._controller import NoActivationCondition`
`4`	`3`	`from ._wrappers import NNWrapper`