
Commit 6f2c3ab

Add ensemble for electra models, majority voting
1 parent 7ea61d0

9 files changed, +367 -0 lines changed

README.md

Lines changed: 84 additions & 0 deletions
@@ -1,2 +1,86 @@
# python-chebifier

An AI ensemble model for predicting chemical classes.

## Installation

```bash
# Clone the repository
git clone https://github.com/yourusername/python-chebifier.git
cd python-chebifier

# Install the package
pip install -e .
```

## Usage

### Command Line Interface

The package provides a command-line interface (CLI) for making predictions using an ensemble model.

```bash
# Get help
python -m chebifier.cli --help

# Make predictions using a configuration file
python -m chebifier.cli predict example_config.yml --smiles "CC(=O)OC1=CC=CC=C1C(=O)O" "C1=CC=C(C=C1)C(=O)O"

# Make predictions using SMILES from a file
python -m chebifier.cli predict example_config.yml --smiles-file smiles.txt
```
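
The file passed to `--smiles-file` is plain text with one SMILES string per line (as stated in the option's help text). A hypothetical `smiles.txt` could look like this:

```text
CC(=O)OC1=CC=CC=C1C(=O)O
C1=CC=C(C=C1)C(=O)O
CCO
```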

### Configuration File

The CLI requires a YAML configuration file that defines the ensemble model. Here's an example:

```yaml
# Example configuration file for Chebifier ensemble model

# Each key in the top-level dictionary is a model name
model1:
  # Required: type of model (must be one of the keys in MODEL_TYPES)
  type: electra
  # Required: name of the model
  model_name: electra_model1
  # Required: path to the checkpoint file
  ckpt_path: /path/to/checkpoint1.ckpt
  # Required: path to the target labels file
  target_labels_path: /path/to/target_labels1.txt
  # Optional: batch size for predictions (defaults to 1 if omitted)
  batch_size: 32

model2:
  type: electra
  model_name: electra_model2
  ckpt_path: /path/to/checkpoint2.ckpt
  target_labels_path: /path/to/target_labels2.txt
  batch_size: 64
```
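
The target labels file referenced by `target_labels_path` is read as one class label per line (see `NNPredictor.__init__` further down in this commit). A hypothetical `target_labels1.txt` might therefore contain:

```text
CHEBI:15377
CHEBI:16236
CHEBI:25558
```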

### Python API

You can also use the package programmatically:

```python
from chebifier.ensemble.base_ensemble import BaseEnsemble
import yaml

# Load configuration from YAML file
with open('configs/example_config.yml', 'r') as f:
    config = yaml.safe_load(f)

# Instantiate ensemble model
ensemble = BaseEnsemble(config)

# Make predictions
smiles_list = ["CC(=O)OC1=CC=CC=C1C(=O)O", "C1=CC=C(C=C1)C(=O)O"]
predictions = ensemble.predict_smiles_list(smiles_list)

# Print results
for smile, prediction in zip(smiles_list, predictions):
    print(f"SMILES: {smile}")
    if prediction:
        print(f"Predicted classes: {prediction}")
    else:
        print("No predictions")
```
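
`predict_smiles_list` returns one entry per input SMILES: the list of class IDs accepted by the ensemble's majority vote (see `BaseEnsemble.aggregate_predictions` below). With entirely made-up class IDs, a run of the snippet above might print:

```text
SMILES: CC(=O)OC1=CC=CC=C1C(=O)O
Predicted classes: ['CHEBI:25558', 'CHEBI:33575']
SMILES: C1=CC=C(C=C1)C(=O)O
No predictions
```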

chebifier/__init__.py

Whitespace-only changes.

chebifier/cli.py

Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
```python
import click
import yaml

from chebifier.ensemble.base_ensemble import BaseEnsemble


@click.group()
def cli():
    """Command line interface for Chebifier."""
    pass


@cli.command()
@click.argument('config_file', type=click.Path(exists=True))
@click.option('--smiles', '-s', multiple=True, help='SMILES strings to predict')
@click.option('--smiles-file', '-f', type=click.Path(exists=True), help='File containing SMILES strings (one per line)')
@click.option('--output', '-o', type=click.Path(), help='Output file to save predictions (optional)')
def predict(config_file, smiles, smiles_file, output):
    """Predict ChEBI classes for SMILES strings using an ensemble model.

    CONFIG_FILE is the path to a YAML configuration file for the ensemble model.
    """
    # Load configuration from YAML file
    with open(config_file, 'r') as f:
        config = yaml.safe_load(f)

    # Instantiate ensemble model
    ensemble = BaseEnsemble(config)

    # Collect SMILES strings from arguments and/or file
    smiles_list = list(smiles)
    if smiles_file:
        with open(smiles_file, 'r') as f:
            smiles_list.extend([line.strip() for line in f if line.strip()])

    if not smiles_list:
        click.echo("No SMILES strings provided. Use --smiles or --smiles-file options.")
        return

    # Make predictions
    predictions = ensemble.predict_smiles_list(smiles_list)

    if output:
        # Save predictions as JSON, keyed by the input SMILES
        import json
        with open(output, 'w') as f:
            json.dump({smiles: pred for smiles, pred in zip(smiles_list, predictions)}, f, indent=2)
    else:
        # Print results
        for smiles, prediction in zip(smiles_list, predictions):
            click.echo(f"Result for: {smiles}")
            if prediction:
                click.echo(f"  Predicted classes: {', '.join(map(str, prediction))}")
            else:
                click.echo("  No predictions")


if __name__ == '__main__':
    cli()
```
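
When `--output` is given, the CLI dumps a JSON object keyed by the input SMILES, with each value being the list of predicted class IDs (an empty list if nothing was accepted by the ensemble). With made-up class IDs, the output file would look roughly like:

```json
{
  "CC(=O)OC1=CC=CC=C1C(=O)O": ["CHEBI:25558", "CHEBI:33575"],
  "C1=CC=C(C=C1)C(=O)O": []
}
```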

chebifier/ensemble/__init__.py

Whitespace-only changes.
chebifier/ensemble/base_ensemble.py

Lines changed: 109 additions & 0 deletions
@@ -0,0 +1,109 @@
```python
from abc import ABC
import torch
import tqdm
from rdkit import Chem

from chebifier.prediction_models.base_predictor import BasePredictor
from chebifier.prediction_models.electra_predictor import ElectraPredictor

MODEL_TYPES = {
    "electra": ElectraPredictor,
    # todo add other model types here
}


class BaseEnsemble(ABC):

    def __init__(self, model_configs: dict):
        self.models = []
        for model_name, model_config in model_configs.items():
            model_cls = MODEL_TYPES[model_config["type"]]
            model_instance = model_cls(**model_config)
            assert isinstance(model_instance, BasePredictor)
            self.models.append(model_instance)

    def gather_predictions(self, smiles_list):
        """Collect predictions from all models for a list of SMILES strings.

        :param smiles_list: list of SMILES strings to predict
        :return:
            ordered_predictions: torch.Tensor of shape (num_smiles, num_classes, num_models)
            predicted_classes: sorted list of ChEBI IDs predicted by the models
        """
        model_predictions = []
        predicted_classes = set()
        for model in self.models:
            model_predictions.append(model.predict_smiles_list(smiles_list))
            for predicted_smiles in model_predictions[-1]:
                if predicted_smiles is not None:
                    for cls in predicted_smiles:
                        predicted_classes.add(cls)
        print("Sorting predictions...")
        predicted_classes = sorted(predicted_classes)
        # Initialise with NaN, meaning "model made no prediction for this class"
        ordered_predictions = torch.zeros(len(smiles_list), len(predicted_classes), len(self.models)) * torch.nan
        for i, model_prediction in enumerate(model_predictions):
            for j, predicted_smiles in tqdm.tqdm(enumerate(model_prediction),
                                                 total=len(model_prediction),
                                                 desc=f"Sorting predictions for {self.models[i].model_name}"):
                if predicted_smiles is not None:
                    for cls in predicted_smiles:
                        ordered_predictions[j, predicted_classes.index(cls), i] = predicted_smiles[cls]
        return ordered_predictions, predicted_classes

    def aggregate_predictions(self, predictions, predicted_classes, **kwargs):
        """Aggregates predictions from multiple models using majority voting.

        :param predictions: torch.Tensor of shape (num_smiles, num_classes, num_models)
        :param predicted_classes: list of ChEBI IDs predicted by the models
        :param kwargs: Additional arguments
        :return: list of lists, where each inner list contains the class IDs that received
            positive predictions from the majority of models for a given SMILES
        """
        num_smiles, num_classes, num_models = predictions.shape
        result = []

        for i in tqdm.tqdm(range(num_smiles), total=num_smiles, desc="Aggregating predictions"):
            smiles_result = []
            for j in range(num_classes):
                # Get predictions for this SMILES and class across all models
                class_predictions = predictions[i, j, :]

                # Count models that made a prediction (not NaN)
                valid_predictions = ~torch.isnan(class_predictions)
                num_valid_predictions = valid_predictions.sum().item()

                # If no valid predictions, skip this class
                if num_valid_predictions == 0:
                    continue

                # Count positive predictions (assuming positive is > 0)
                positive_predictions = class_predictions > 0
                num_positive = (positive_predictions & valid_predictions).sum().item()

                # If the majority of models that made a prediction are positive, add this class
                if num_positive > num_valid_predictions / 2:
                    smiles_result.append(predicted_classes[j])

            result.append(smiles_result)

        return result

    def normalize_smiles_list(self, smiles_list):
        new = []
        print("Normalizing SMILES strings...")
        for smiles in tqdm.tqdm(smiles_list):
            try:
                mol = Chem.MolFromSmiles(smiles)
                canonical_smiles = Chem.MolToSmiles(mol)
            except Exception as e:
                print(f"Failed to parse SMILES '{smiles}': {e}")
                canonical_smiles = None
            new.append(canonical_smiles)
        return new

    def predict_smiles_list(self, smiles_list) -> list:
        # smiles_list = self.normalize_smiles_list(smiles_list)
        ordered_predictions, predicted_classes = self.gather_predictions(smiles_list)
        aggregated_predictions = self.aggregate_predictions(ordered_predictions, predicted_classes)
        return aggregated_predictions
```
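
As a sanity check on the voting rule in `aggregate_predictions`, here is a minimal standalone sketch with made-up scores and hypothetical class IDs. The tensor has shape (2 SMILES, 2 classes, 3 models); NaN marks a model that produced no prediction for a class, and scores above 0 count as positive votes, exactly as in the method above:

```python
import torch

# Made-up scores, shape (num_smiles=2, num_classes=2, num_models=3)
scores = torch.tensor([
    [[1.2, -0.3, 0.8], [float("nan"), -1.0, -0.5]],
    [[-0.2, 0.4, float("nan")], [2.1, 0.9, 1.5]],
])
classes = ["CHEBI:15377", "CHEBI:16236"]  # hypothetical class IDs

for i in range(scores.shape[0]):
    accepted = []
    for j, cls in enumerate(classes):
        votes = scores[i, j, :]
        valid = ~torch.isnan(votes)       # models that actually voted
        positive = (votes > 0) & valid    # positive votes among them
        # Keep the class only if a strict majority of the voting models is positive
        if valid.sum() > 0 and positive.sum() > valid.sum() / 2:
            accepted.append(cls)
    print(f"SMILES {i}: {accepted}")
# SMILES 0: ['CHEBI:15377']  (class 0: 2 of 3 positive; class 1: 0 of 2)
# SMILES 1: ['CHEBI:16236']  (class 0: 1 of 2 is no strict majority; class 1: 3 of 3)
```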

chebifier/prediction_models/__init__.py

Whitespace-only changes.
chebifier/prediction_models/base_predictor.py

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
```python
from abc import ABC


class BasePredictor(ABC):

    def __init__(self, model_name: str, **kwargs):
        self.model_name = model_name

    def predict_smiles_list(self, smiles_list: list[str]) -> list:
        raise NotImplementedError
```
chebifier/prediction_models/electra_predictor.py

Lines changed: 22 additions & 0 deletions
@@ -0,0 +1,22 @@
```python
from chebifier.prediction_models.nn_predictor import NNPredictor
from chebai.models.electra import Electra
from chebai.preprocessing.reader import ChemDataReader


class ElectraPredictor(NNPredictor):

    def __init__(self, model_name: str, ckpt_path: str, **kwargs):
        super().__init__(model_name, ckpt_path, reader_cls=ChemDataReader, **kwargs)
        print(f"Initialised Electra model {self.model_name} (device: {self.device})")

    def init_model(self, ckpt_path: str, **kwargs) -> Electra:
        model = Electra.load_from_checkpoint(
            ckpt_path,
            map_location=self.device,
            criterion=None, strict=False,
            metrics=dict(train=dict(), test=dict(), validation=dict()), pretrained_checkpoint=None
        )
        model.eval()
        return model
```
chebifier/prediction_models/nn_predictor.py

Lines changed: 79 additions & 0 deletions
@@ -0,0 +1,79 @@
```python
import tqdm

from chebifier.prediction_models.base_predictor import BasePredictor
from rdkit import Chem
import numpy as np
import torch


class NNPredictor(BasePredictor):

    def __init__(self, model_name: str, ckpt_path: str, reader_cls, target_labels_path: str, **kwargs):
        super().__init__(model_name, **kwargs)
        self.reader_cls = reader_cls

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = self.init_model(ckpt_path=ckpt_path)
        self.target_labels = [line.strip() for line in open(target_labels_path, encoding="utf-8")]
        self.batch_size = kwargs.get("batch_size", 1)

    def init_model(self, ckpt_path: str, **kwargs):
        raise NotImplementedError("Model initialization must be implemented in subclasses.")

    def calculate_results(self, batch):
        collator = self.reader_cls.COLLATOR()
        dat = self.model._process_batch(collator(batch).to(self.device), 0)
        return self.model(dat, **dat["model_kwargs"])

    def batchify(self, batch):
        cache = []
        for r in batch:
            cache.append(r)
            if len(cache) >= self.batch_size:
                yield cache
                cache = []
        if cache:
            yield cache

    def read_smiles(self, smiles):
        reader = self.reader_cls()
        d = reader.to_data(dict(features=smiles, labels=None))
        return d

    def predict_smiles_list(self, smiles_list) -> list:
        """Returns a list with the length of smiles_list; each element is either None (= failure)
        or a dictionary of classes and predicted values."""
        token_dicts = []
        could_not_parse = []
        index_map = dict()
        for i, smiles in enumerate(smiles_list):
            try:
                # Try to parse the SMILES string
                if not smiles:
                    raise ValueError()
                d = self.read_smiles(smiles)
                # This is just for sanity checks
                rdmol = Chem.MolFromSmiles(smiles, sanitize=False)
            except Exception as e:
                # Note if it fails
                could_not_parse.append(i)
                print(f"Failed to parse {smiles} due to {e}")
            else:
                if rdmol is None:
                    could_not_parse.append(i)
                else:
                    index_map[i] = len(token_dicts)
                    token_dicts.append(d)
        results = []
        if token_dicts:
            for batch in tqdm.tqdm(self.batchify(token_dicts), desc=f"{self.model_name}",
                                   total=len(token_dicts) // self.batch_size):
                result = self.calculate_results(batch)
                if isinstance(result, dict) and "logits" in result:
                    result = result["logits"]
                results += result.cpu().detach().tolist()
            results = np.stack(results, axis=0)
            preds = [{self.target_labels[j]: p for j, p in enumerate(results[index_map[i]])}
                     if i not in could_not_parse else None for i in range(len(smiles_list))]
            return preds
        else:
            return [None for _ in smiles_list]
```
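
For illustration, the value returned by `predict_smiles_list` pairs each input with either a per-label score dictionary or None. With hypothetical target labels and made-up scores, two inputs (one valid, one unparsable) would come back roughly as:

```python
# Hypothetical return value for ["CC(=O)O", "not-a-smiles"] (labels and scores made up)
preds = [
    {"CHEBI:25558": 3.1, "CHEBI:33575": -1.7},  # one predicted value per target label
    None,                                       # SMILES that could not be parsed
]
```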
