Commit 434b52a: add weighted majority voting, gnn and chemlog
1 parent: 6f2c3ab
7 files changed: +278 −51 lines changed

7 files changed

+278
-51
lines changed

chebifier/cli.py

Lines changed: 9 additions & 2 deletions

@@ -5,20 +5,27 @@
 import yaml
 import sys
 from chebifier.ensemble.base_ensemble import BaseEnsemble
+from chebifier.ensemble.weighted_majority_ensemble import WMVwithPPVNPVEnsemble, WMVwithF1Ensemble


 @click.group()
 def cli():
     """Command line interface for Chebifier."""
     pass

+ENSEMBLES = {
+    "mv": BaseEnsemble,
+    "wmv-ppvnpv": WMVwithPPVNPVEnsemble,
+    "wmv-f1": WMVwithF1Ensemble
+}

 @cli.command()
 @click.argument('config_file', type=click.Path(exists=True))
 @click.option('--smiles', '-s', multiple=True, help='SMILES strings to predict')
 @click.option('--smiles-file', '-f', type=click.Path(exists=True), help='File containing SMILES strings (one per line)')
 @click.option('--output', '-o', type=click.Path(), help='Output file to save predictions (optional)')
-def predict(config_file, smiles, smiles_file, output):
+@click.option('--ensemble-type', '-e', type=click.Choice(ENSEMBLES.keys()), default='mv', help='Type of ensemble to use (default: Majority Voting)')
+def predict(config_file, smiles, smiles_file, output, ensemble_type):
     """Predict ChEBI classes for SMILES strings using an ensemble model.

     CONFIG_FILE is the path to a YAML configuration file for the ensemble model.

@@ -28,7 +35,7 @@ def predict(config_file, smiles, smiles_file, output):
         config = yaml.safe_load(f)

     # Instantiate ensemble model
-    ensemble = BaseEnsemble(config)
+    ensemble = ENSEMBLES[ensemble_type](config)

     # Collect SMILES strings from arguments and/or file
     smiles_list = list(smiles)
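
For orientation, a minimal sketch of what the new `--ensemble-type` / `-e` flag does: it selects an ensemble class from `ENSEMBLES` and instantiates it with the parsed YAML config, exactly as in the `predict` body above. The config path and SMILES below are placeholders, not files or data from the repository.

# Sketch only; "my_ensemble.yml" is a hypothetical config file.
import yaml
from chebifier.cli import ENSEMBLES

with open("my_ensemble.yml") as f:
    config = yaml.safe_load(f)

ensemble = ENSEMBLES["wmv-f1"](config)  # resolves to WMVwithF1Ensemble
predictions = ensemble.predict_smiles_list(["CC(=O)Nc1ccc(O)cc1"])  # paracetamol as example input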
chebifier/ensemble/base_ensemble.py

Lines changed: 75 additions & 46 deletions

@@ -1,14 +1,18 @@
+import os
 from abc import ABC
 import torch
 import tqdm
 from rdkit import Chem

 from chebifier.prediction_models.base_predictor import BasePredictor
+from chebifier.prediction_models.chemlog_predictor import ChemLogPredictor
 from chebifier.prediction_models.electra_predictor import ElectraPredictor
+from chebifier.prediction_models.gnn_predictor import ResGatedPredictor

 MODEL_TYPES = {
     "electra": ElectraPredictor,
-    # todo add other model types here
+    "resgated": ResGatedPredictor,
+    "chemlog": ChemLogPredictor
 }

 class BaseEnsemble(ABC):
@@ -22,70 +26,73 @@ def __init__(self, model_configs: dict):
             self.models.append(model_instance)

     def gather_predictions(self, smiles_list):
-        """
-
-        :param smiles_list: list of SMILES strings to predict
-        :return:
-        ordered_predictions: torch.Tensor of shape (num_smiles, num_classes, num_models)
-        predicted_classes: list of ChEBI IDs predicted by the models
-        """
         model_predictions = []
         predicted_classes = set()
         for model in self.models:
             model_predictions.append(model.predict_smiles_list(smiles_list))
-            for predicted_smiles in model_predictions[-1]:
-                if predicted_smiles is not None:
-                    for cls in predicted_smiles:
+            for predicted_labels_for_smiles in model_predictions[-1]:
+                if predicted_labels_for_smiles is not None:
+                    for cls in predicted_labels_for_smiles:
                         predicted_classes.add(cls)
         print(f"Sorting predictions...")
         predicted_classes = sorted(list(predicted_classes))
+        predicted_classes = {cls: i for i, cls in enumerate(predicted_classes)}
         ordered_predictions = torch.zeros(len(smiles_list), len(predicted_classes), len(self.models)) * torch.nan
         for i, model_prediction in enumerate(model_predictions):
-            for j, predicted_smiles in tqdm.tqdm(enumerate(model_prediction),
+            for j, predicted_labels_for_smiles in tqdm.tqdm(enumerate(model_prediction),
                                                  total=len(model_prediction),
                                                  desc=f"Sorting predictions for {self.models[i].model_name}"):
-                if predicted_smiles is not None:
-                    for cls in predicted_smiles:
-                        ordered_predictions[j, predicted_classes.index(cls), i] = predicted_smiles[cls]
+                if predicted_labels_for_smiles is not None:
+                    for cls in predicted_labels_for_smiles:
+                        ordered_predictions[j, predicted_classes[cls], i] = predicted_labels_for_smiles[cls]
         return ordered_predictions, predicted_classes


-    def aggregate_predictions(self, predictions, predicted_classes, **kwargs):
+    def consolidate_predictions(self, predictions, predicted_classes, classwise_weights, **kwargs):
         """
-        Aggregates predictions from multiple models using majority voting.
-
-        :param predictions: torch.Tensor of shape (num_smiles, num_classes, num_models)
-        :param predicted_classes: list of ChEBI IDs predicted by the models
-        :param kwargs: Additional arguments
-        :return: list of lists, where each inner list contains the class IDs that received
-        positive predictions from the majority of models for a given SMILES
+        Aggregates predictions from multiple models using weighted majority voting.
+        Optimized version using tensor operations instead of for loops.
         """
         num_smiles, num_classes, num_models = predictions.shape
-        result = []

-        for i in tqdm.tqdm(range(num_smiles), total=num_smiles, desc="Aggregating predictions"):
-            smiles_result = []
-            for j in range(num_classes):
-                # Get predictions for this SMILES and class across all models
-                class_predictions = predictions[i, j, :]
+        # Create a mapping from class indices to class names for faster lookup
+        class_names = list(predicted_classes.keys())
+        class_indices = {predicted_classes[cls]: cls for cls in class_names}
+
+        # Get predictions for all classes
+        valid_predictions = ~torch.isnan(predictions)
+        valid_counts = valid_predictions.sum(dim=2)  # Sum over models dimension
+
+        # Skip classes with no valid predictions
+        has_valid_predictions = valid_counts > 0
+
+        # Calculate positive and negative predictions for all classes at once
+        positive_mask = (predictions > 0.5) & valid_predictions
+        negative_mask = (predictions < 0.5) & valid_predictions

-                # Count models that made a prediction (not NaN)
-                valid_predictions = ~torch.isnan(class_predictions)
-                num_valid_predictions = valid_predictions.sum().item()
+        # Extract positive and negative weights
+        pos_weights = classwise_weights[0]  # Shape: (num_classes, num_models)
+        neg_weights = classwise_weights[1]  # Shape: (num_classes, num_models)

-                # If no valid predictions, skip this class
-                if num_valid_predictions == 0:
-                    continue
+        # Calculate weighted predictions using broadcasting
+        # predictions shape: (num_smiles, num_classes, num_models)
+        # weights shape: (num_classes, num_models)
+        positive_weighted = positive_mask.float() * (predictions.nan_to_num() - 0.5) * pos_weights.unsqueeze(0)
+        negative_weighted = negative_mask.float() * (0.5 - predictions.nan_to_num()) * neg_weights.unsqueeze(0)

-                # Count positive predictions (assuming positive is > 0)
-                positive_predictions = class_predictions > 0
-                num_positive = (positive_predictions & valid_predictions).sum().item()
+        # Sum over models dimension
+        positive_sum = positive_weighted.sum(dim=2)  # Shape: (num_smiles, num_classes)
+        negative_sum = negative_weighted.sum(dim=2)  # Shape: (num_smiles, num_classes)

-                # If majority of models that made a prediction are positive, add this class
-                if num_positive > num_valid_predictions / 2:
-                    smiles_result.append(predicted_classes[j])
+        # Determine which classes to include for each SMILES
+        net_score = positive_sum - negative_sum  # Shape: (num_smiles, num_classes)
+        class_decisions = (net_score > 0) & has_valid_predictions  # Shape: (num_smiles, num_classes)

-            result.append(smiles_result)
+        # Convert tensor decisions to result list using list comprehension for efficiency
+        result = [
+            [class_indices[idx.item()] for idx in torch.nonzero(class_decisions[i], as_tuple=True)[0]]
+            for i in range(num_smiles)
+        ]

         return result

@@ -102,8 +109,30 @@ def normalize_smiles_list(self, smiles_list):
             new.append(canonical_smiles)
         return new

-    def predict_smiles_list(self, smiles_list) -> list:
-        #smiles_list = self.normalize_smiles_list(smiles_list)
-        ordered_predictions, predicted_classes = self.gather_predictions(smiles_list)
-        aggregated_predictions = self.aggregate_predictions(ordered_predictions, predicted_classes)
+    def calculate_classwise_weights(self, predicted_classes):
+        """No weights, simple majority voting"""
+        positive_weights = torch.ones(len(predicted_classes), len(self.models))
+        negative_weights = torch.ones(len(predicted_classes), len(self.models))
+
+        return positive_weights, negative_weights
+
+    def predict_smiles_list(self, smiles_list, load_preds_if_possible=True) -> list:
+        preds_file = f"predictions_by_model_{'_'.join(model.model_name for model in self.models)}.pt"
+        predicted_classes_file = f"predicted_classes_{'_'.join(model.model_name for model in self.models)}.txt"
+        if not load_preds_if_possible or not os.path.isfile(preds_file):
+            #smiles_list = self.normalize_smiles_list(smiles_list)
+            ordered_predictions, predicted_classes = self.gather_predictions(smiles_list)
+            # save predictions
+            torch.save(ordered_predictions, preds_file)
+            with open(predicted_classes_file, "w") as f:
+                for cls in predicted_classes:
+                    f.write(f"{cls}\n")
+        else:
+            print(f"Loading predictions from {preds_file} and label indexes from {predicted_classes_file}")
+            ordered_predictions = torch.load(preds_file)
+            with open(predicted_classes_file, "r") as f:
+                predicted_classes = {line.strip(): i for i, line in enumerate(f.readlines())}
+
+        classwise_weights = self.calculate_classwise_weights(predicted_classes)
+        aggregated_predictions = self.consolidate_predictions(ordered_predictions, predicted_classes, classwise_weights)
         return aggregated_predictions
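
For intuition, here is a small self-contained walk-through of the vectorised vote in `consolidate_predictions` (a sketch with made-up scores, not code from the commit): scores above 0.5 count as positive votes weighted by their margin from 0.5, scores below 0.5 count as negative votes, and NaN marks a class a model does not cover.

import torch

# Toy scores, shape (num_smiles=1, num_classes=2, num_models=2).
# Model 0 scores class A at 0.9 and class B at 0.2; model 1 does not
# cover class A (NaN) and scores class B at 0.7.
predictions = torch.tensor([[[0.9, float("nan")],
                             [0.2, 0.7]]])
pos_w = torch.ones(2, 2)  # unit weights, i.e. the plain majority-voting default
neg_w = torch.ones(2, 2)

valid = ~torch.isnan(predictions)
pos = (predictions > 0.5) & valid
neg = (predictions < 0.5) & valid
p = (pos.float() * (predictions.nan_to_num() - 0.5) * pos_w.unsqueeze(0)).sum(dim=2)
n = (neg.float() * (0.5 - predictions.nan_to_num()) * neg_w.unsqueeze(0)).sum(dim=2)
print(p - n)  # ≈ tensor([[ 0.4, -0.1]]): class A is accepted, class B is rejected

Note also that `predict_smiles_list` now caches the raw per-model scores in a `.pt` file named after the member models, so a repeated run with `load_preds_if_possible=True` skips the expensive `gather_predictions` step and only re-runs the weighting and voting.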
chebifier/ensemble/weighted_majority_ensemble.py

Lines changed: 54 additions & 0 deletions

@@ -0,0 +1,54 @@
+import torch
+
+from chebifier.ensemble.base_ensemble import BaseEnsemble
+
+
+
+class WMVwithPPVNPVEnsemble(BaseEnsemble):
+
+    def calculate_classwise_weights(self, predicted_classes):
+        """
+        Given the positions of predicted classes in the predictions tensor, assign weights to each class. The
+        result is two tensors of shape (num_predicted_classes, num_models). The weight for each class is the model_weight
+        (default: 1) multiplied by the class-specific positive / negative weight (default 1).
+        """
+        positive_weights = torch.ones(len(predicted_classes), len(self.models))
+        negative_weights = torch.ones(len(predicted_classes), len(self.models))
+        for j, model in enumerate(self.models):
+            positive_weights[:, j] *= model.model_weight
+            negative_weights[:, j] *= model.model_weight
+            if model.classwise_weights is None:
+                continue
+            for cls, weights in model.classwise_weights.items():
+                positive_weights[predicted_classes[cls], j] *= weights["PPV"]
+                negative_weights[predicted_classes[cls], j] *= weights["NPV"]
+
+        print(f"Calculated model weightings. The averages for positive / negative weights are:")
+        for i, model in enumerate(self.models):
+            print(f"{model.model_name}: {positive_weights[:, i].mean().item():.3f} / {negative_weights[:, i].mean().item():.3f}")
+
+        return positive_weights, negative_weights
+
+
+class WMVwithF1Ensemble(BaseEnsemble):
+
+    def calculate_classwise_weights(self, predicted_classes):
+        """
+        Given the positions of predicted classes in the predictions tensor, assign weights to each class. The
+        result is two tensors of shape (num_predicted_classes, num_models). The weight for each class is the model_weight
+        (default: 1) multiplied by the class-specific validation-f1 (default 1).
+        """
+        weights_by_cls = torch.ones(len(predicted_classes), len(self.models))
+        for j, model in enumerate(self.models):
+            weights_by_cls[:, j] *= model.model_weight
+            if model.classwise_weights is None:
+                continue
+            for cls, weights in model.classwise_weights.items():
+                f1 = 2 * weights["TP"] / (2 * weights["TP"] + weights["FP"] + weights["FN"])
+                weights_by_cls[predicted_classes[cls], j] *= f1
+
+        print(f"Calculated model weightings. The average weights are:")
+        for i, model in enumerate(self.models):
+            print(f"{model.model_name}: {weights_by_cls[:, i].mean().item():.3f}")
+
+        return weights_by_cls, weights_by_cls
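
To make the two weighting schemes concrete, a toy calculation with hypothetical numbers (not taken from the repository); a model's `classwise_weights` entry is assumed to be a per-class dict carrying the PPV/NPV/TP/FP/FN keys read above.

# Hypothetical validation statistics for one model on ChEBI class "25676".
weights = {"TP": 80, "FP": 10, "FN": 20, "PPV": 80 / 90, "NPV": 0.98}

# WMVwithF1Ensemble scales both the positive and the negative vote of this
# model for the class by its validation F1:
f1 = 2 * weights["TP"] / (2 * weights["TP"] + weights["FP"] + weights["FN"])
print(round(f1, 3))  # 0.842

# WMVwithPPVNPVEnsemble instead scales positive votes by PPV (precision) and
# negative votes by NPV, so a model that is precise but insensitive for a class
# counts more when it says "yes" than when it says "no".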
chebifier/prediction_models/base_predictor.py

Lines changed: 8 additions & 2 deletions

@@ -1,10 +1,16 @@
 from abc import ABC
-
+import json

 class BasePredictor(ABC):

-    def __init__(self, model_name: str, **kwargs):
+    def __init__(self, model_name: str, model_weight: int = 1, classwise_weights_path: str = None, **kwargs):
         self.model_name = model_name
+        self.model_weight = model_weight
+        if classwise_weights_path is not None:
+            self.classwise_weights = json.load(open(classwise_weights_path, encoding="utf-8"))
+        else:
+            self.classwise_weights = None
+

     def predict_smiles_list(self, smiles_list: list[str]) -> dict:
         raise NotImplementedError
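
A minimal sketch of the new constructor arguments (the subclass, file contents, and numbers below are invented for illustration): `model_weight` scales a model's votes globally, while `classwise_weights_path` points to a JSON file keyed by ChEBI class.

import json, tempfile
from chebifier.prediction_models.base_predictor import BasePredictor

class DummyPredictor(BasePredictor):  # stand-in predictor, not part of the repo
    def predict_smiles_list(self, smiles_list):
        return [{"25676": 0.9} for _ in smiles_list]

# Write a hypothetical classwise-weights file with the keys the ensembles read.
with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as f:
    json.dump({"25676": {"PPV": 0.9, "NPV": 0.99, "TP": 90, "FP": 10, "FN": 15}}, f)

model = DummyPredictor("dummy", model_weight=2, classwise_weights_path=f.name)
print(model.model_weight, model.classwise_weights["25676"]["PPV"])  # 2 0.9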
chebifier/prediction_models/chemlog_predictor.py

Lines changed: 35 additions & 0 deletions

@@ -0,0 +1,35 @@
+import tqdm
+
+from chebifier.prediction_models.base_predictor import BasePredictor
+from chemlog.alg_classification.charge_classifier import AlgChargeClassifier
+from chemlog.alg_classification.peptide_size_classifier import AlgPeptideSizeClassifier
+from chemlog.alg_classification.proteinogenics_classifier import AlgProteinogenicsClassifier
+from chemlog.alg_classification.substructure_classifier import AlgSubstructureClassifier
+from chemlog.cli import strategy_call, _smiles_to_mol, CLASSIFIERS
+
+class ChemLogPredictor(BasePredictor):
+
+    def __init__(self, model_name: str, **kwargs):
+        super().__init__(model_name, **kwargs)
+        self.strategy = "algo"
+        self.classifier_instances = {
+            k: v() for k, v in CLASSIFIERS[self.strategy].items()
+        }
+        self.peptide_labels = ["15841", "16670", "24866", "25676", "25696", "25697", "27369", "46761", "47923",
+                               "48030", "48545", "60194", "60334", "60466", "64372", "65061", "90799", "155837"]
+
+        print(f"Initialised ChemLog model {self.model_name}")
+
+    def predict_smiles_list(self, smiles_list: list[str]) -> list:
+        results = []
+        for i, smiles in tqdm.tqdm(enumerate(smiles_list)):
+            mol = _smiles_to_mol(smiles)
+            if mol is None:
+                results.append(None)
+            else:
+                results.append({label: 1 if label in strategy_call(self.strategy, self.classifier_instances, mol)["chebi_classes"] else 0 for label in self.peptide_labels})
+
+        for classifier in self.classifier_instances.values():
+            classifier.on_finish()
+
+        return results
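
Finally, a usage sketch for the new rule-based predictor (requires the chemlog package to be installed; the SMILES is an arbitrary dipeptide, and the actual 0/1 assignments depend on ChemLog's classification, so they are not shown here).

from chebifier.prediction_models.chemlog_predictor import ChemLogPredictor

predictor = ChemLogPredictor("chemlog")
preds = predictor.predict_smiles_list(["NC(Cc1ccccc1)C(=O)NCC(=O)O"])  # Phe-Gly dipeptide
print(preds[0])  # dict mapping the 18 peptide-related ChEBI IDs above to 0 or 1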
