pre-commit run -a

aditya0by0 · aditya0by0 · commit 9c3beea54298 · 2025-06-28T19:33:08.000+02:00
diff --git a/chebifier/cli.py b/chebifier/cli.py
@@ -1,48 +1,66 @@
-
-
-
 import click
 import yaml
-import sys
+
 from chebifier.ensemble.base_ensemble import BaseEnsemble
-from chebifier.ensemble.weighted_majority_ensemble import WMVwithPPVNPVEnsemble, WMVwithF1Ensemble
+from chebifier.ensemble.weighted_majority_ensemble import (
+    WMVwithF1Ensemble,
+    WMVwithPPVNPVEnsemble,
+)
 
 
 @click.group()
 def cli():
     """Command line interface for Chebifier."""
     pass
 
+
 ENSEMBLES = {
     "mv": BaseEnsemble,
     "wmv-ppvnpv": WMVwithPPVNPVEnsemble,
-    "wmv-f1": WMVwithF1Ensemble
+    "wmv-f1": WMVwithF1Ensemble,
 }
 
+
 @cli.command()
-@click.argument('config_file', type=click.Path(exists=True))
-@click.option('--smiles', '-s', multiple=True, help='SMILES strings to predict')
-@click.option('--smiles-file', '-f', type=click.Path(exists=True), help='File containing SMILES strings (one per line)')
-@click.option('--output', '-o', type=click.Path(), help='Output file to save predictions (optional)')
-@click.option('--ensemble-type', '-e', type=click.Choice(ENSEMBLES.keys()), default='mv', help='Type of ensemble to use (default: Majority Voting)')
+@click.argument("config_file", type=click.Path(exists=True))
+@click.option("--smiles", "-s", multiple=True, help="SMILES strings to predict")
+@click.option(
+    "--smiles-file",
+    "-f",
+    type=click.Path(exists=True),
+    help="File containing SMILES strings (one per line)",
+)
+@click.option(
+    "--output",
+    "-o",
+    type=click.Path(),
+    help="Output file to save predictions (optional)",
+)
+@click.option(
+    "--ensemble-type",
+    "-e",
+    type=click.Choice(ENSEMBLES.keys()),
+    default="mv",
+    help="Type of ensemble to use (default: Majority Voting)",
+)
 def predict(config_file, smiles, smiles_file, output, ensemble_type):
     """Predict ChEBI classes for SMILES strings using an ensemble model.
-    
+
     CONFIG_FILE is the path to a YAML configuration file for the ensemble model.
     """
     # Load configuration from YAML file
-    with open(config_file, 'r') as f:
+    with open(config_file, "r") as f:
         config = yaml.safe_load(f)
-    
+
     # Instantiate ensemble model
     ensemble = ENSEMBLES[ensemble_type](config)
-    
+
     # Collect SMILES strings from arguments and/or file
     smiles_list = list(smiles)
     if smiles_file:
-        with open(smiles_file, 'r') as f:
+        with open(smiles_file, "r") as f:
             smiles_list.extend([line.strip() for line in f if line.strip()])
-    
+
     if not smiles_list:
         click.echo("No SMILES strings provided. Use --smiles or --smiles-file options.")
         return
@@ -53,8 +71,13 @@ def predict(config_file, smiles, smiles_file, output, ensemble_type):
     if output:
         # save as json
         import json
-        with open(output, 'w') as f:
-            json.dump({smiles: pred for smiles, pred in zip(smiles_list, predictions)}, f, indent=2)
+
+        with open(output, "w") as f:
+            json.dump(
+                {smiles: pred for smiles, pred in zip(smiles_list, predictions)},
+                f,
+                indent=2,
+            )
 
     else:
         # Print results
@@ -66,5 +89,5 @@ def predict(config_file, smiles, smiles_file, output, ensemble_type):
                 click.echo("  No predictions")
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     cli()
diff --git a/chebifier/ensemble/weighted_majority_ensemble.py b/chebifier/ensemble/weighted_majority_ensemble.py
@@ -3,9 +3,7 @@
 from chebifier.ensemble.base_ensemble import BaseEnsemble
 
 
-
 class WMVwithPPVNPVEnsemble(BaseEnsemble):
-
     def calculate_classwise_weights(self, predicted_classes):
         """
         Given the positions of predicted classes in the predictions tensor, assign weights to each class. The
@@ -23,15 +21,18 @@ def calculate_classwise_weights(self, predicted_classes):
                 positive_weights[predicted_classes[cls], j] *= weights["PPV"]
                 negative_weights[predicted_classes[cls], j] *= weights["NPV"]
 
-        print(f"Calculated model weightings. The averages for positive / negative weights are:")
+        print(
+            "Calculated model weightings. The averages for positive / negative weights are:"
+        )
         for i, model in enumerate(self.models):
-            print(f"{model.model_name}: {positive_weights[:, i].mean().item():.3f} / {negative_weights[:, i].mean().item():.3f}")
+            print(
+                f"{model.model_name}: {positive_weights[:, i].mean().item():.3f} / {negative_weights[:, i].mean().item():.3f}"
+            )
 
         return positive_weights, negative_weights
 
 
 class WMVwithF1Ensemble(BaseEnsemble):
-
     def calculate_classwise_weights(self, predicted_classes):
         """
         Given the positions of predicted classes in the predictions tensor, assign weights to each class. The
@@ -45,11 +46,15 @@ def calculate_classwise_weights(self, predicted_classes):
                 continue
             for cls, weights in model.classwise_weights.items():
                 if (2 * weights["TP"] + weights["FP"] + weights["FN"]) > 0:
-                    f1 = 2 * weights["TP"] / (2 * weights["TP"] + weights["FP"] + weights["FN"])
+                    f1 = (
+                        2
+                        * weights["TP"]
+                        / (2 * weights["TP"] + weights["FP"] + weights["FN"])
+                    )
                     weights_by_cls[predicted_classes[cls], j] *= f1
 
-        print(f"Calculated model weightings. The average weights are:")
+        print("Calculated model weightings. The average weights are:")
         for i, model in enumerate(self.models):
             print(f"{model.model_name}: {weights_by_cls[:, i].mean().item():.3f}")
 
-        return weights_by_cls, weights_by_cls
+        return weights_by_cls, weights_by_cls
diff --git a/chebifier/prediction_models/base_predictor.py b/chebifier/prediction_models/base_predictor.py
@@ -1,16 +1,24 @@
-from abc import ABC
 import json
+from abc import ABC
+
 
 class BasePredictor(ABC):
 
-    def __init__(self, model_name: str, model_weight: int = 1, classwise_weights_path: str = None, **kwargs):
+    def __init__(
+        self,
+        model_name: str,
+        model_weight: int = 1,
+        classwise_weights_path: str = None,
+        **kwargs
+    ):
         self.model_name = model_name
         self.model_weight = model_weight
         if classwise_weights_path is not None:
-            self.classwise_weights = json.load(open(classwise_weights_path, encoding="utf-8"))
+            self.classwise_weights = json.load(
+                open(classwise_weights_path, encoding="utf-8")
+            )
         else:
             self.classwise_weights = None
 
-
     def predict_smiles_list(self, smiles_list: list[str]) -> dict:
-        raise NotImplementedError
+        raise NotImplementedError
diff --git a/chebifier/prediction_models/chemlog_predictor.py b/chebifier/prediction_models/chemlog_predictor.py
@@ -1,23 +1,22 @@
 import tqdm
+from chemlog.cli import CLASSIFIERS, _smiles_to_mol, strategy_call
 
 from chebifier.prediction_models.base_predictor import BasePredictor
-from chemlog.alg_classification.charge_classifier import AlgChargeClassifier
-from chemlog.alg_classification.peptide_size_classifier import AlgPeptideSizeClassifier
-from chemlog.alg_classification.proteinogenics_classifier import AlgProteinogenicsClassifier
-from chemlog.alg_classification.substructure_classifier import AlgSubstructureClassifier
-from chemlog.cli import strategy_call, _smiles_to_mol, CLASSIFIERS
 
-class ChemLogPredictor(BasePredictor):
 
+class ChemLogPredictor(BasePredictor):
     def __init__(self, model_name: str, **kwargs):
         super().__init__(model_name, **kwargs)
         self.strategy = "algo"
         self.classifier_instances = {
             k: v() for k, v in CLASSIFIERS[self.strategy].items()
         }
-        self.peptide_labels = ["15841", "16670", "24866", "25676", "25696", "25697", "27369", "46761", "47923",
-                               "48030", "48545", "60194", "60334", "60466", "64372", "65061", "90799", "155837"]
-
+        # fmt: off
+        self.peptide_labels = [
+            "15841", "16670", "24866", "25676", "25696", "25697", "27369", "46761", "47923",
+            "48030", "48545", "60194", "60334", "60466", "64372", "65061", "90799", "155837"
+        ]
+        # fmt: on
         print(f"Initialised ChemLog model {self.model_name}")
 
     def predict_smiles_list(self, smiles_list: list[str]) -> list:
@@ -27,9 +26,21 @@ def predict_smiles_list(self, smiles_list: list[str]) -> list:
             if mol is None:
                 results.append(None)
             else:
-                results.append({label: 1 if label in strategy_call(self.strategy, self.classifier_instances, mol)["chebi_classes"] else 0 for label in self.peptide_labels})
+                results.append(
+                    {
+                        label: (
+                            1
+                            if label
+                            in strategy_call(
+                                self.strategy, self.classifier_instances, mol
+                            )["chebi_classes"]
+                            else 0
+                        )
+                        for label in self.peptide_labels
+                    }
+                )
 
         for classifier in self.classifier_instances.values():
             classifier.on_finish()
 
-        return results
+        return results
diff --git a/chebifier/prediction_models/electra_predictor.py b/chebifier/prediction_models/electra_predictor.py
@@ -1,7 +1,8 @@
-from chebifier.prediction_models.nn_predictor import NNPredictor
 from chebai.models.electra import Electra
 from chebai.preprocessing.reader import ChemDataReader
 
+from chebifier.prediction_models.nn_predictor import NNPredictor
+
 
 class ElectraPredictor(NNPredictor):
 
@@ -13,10 +14,10 @@ def init_model(self, ckpt_path: str, **kwargs) -> Electra:
         model = Electra.load_from_checkpoint(
             ckpt_path,
             map_location=self.device,
-            criterion=None, strict=False,
-            metrics=dict(train=dict(), test=dict(), validation=dict()), pretrained_checkpoint=None
+            criterion=None,
+            strict=False,
+            metrics=dict(train=dict(), test=dict(), validation=dict()),
+            pretrained_checkpoint=None,
         )
         model.eval()
         return model
-
-
diff --git a/chebifier/prediction_models/gnn_predictor.py b/chebifier/prediction_models/gnn_predictor.py
@@ -1,16 +1,19 @@
-from chebifier.prediction_models.nn_predictor import NNPredictor
 import chebai_graph.preprocessing.properties as p
 import torch
 from chebai_graph.models.graph import ResGatedGraphConvNetGraphPred
-from chebai_graph.preprocessing.reader import GraphPropertyReader
 from chebai_graph.preprocessing.property_encoder import IndexEncoder, OneHotEncoder
+from chebai_graph.preprocessing.reader import GraphPropertyReader
 from torch_geometric.data.data import Data as GeomData
 
+from chebifier.prediction_models.nn_predictor import NNPredictor
+
 
 class ResGatedPredictor(NNPredictor):
 
     def __init__(self, model_name: str, ckpt_path: str, molecular_properties, **kwargs):
-        super().__init__(model_name, ckpt_path, reader_cls=GraphPropertyReader, **kwargs)
+        super().__init__(
+            model_name, ckpt_path, reader_cls=GraphPropertyReader, **kwargs
+        )
         # molecular_properties is a list of class paths
         if molecular_properties is not None:
             properties = [self.load_class(prop)() for prop in molecular_properties]
@@ -32,11 +35,23 @@ def load_class(self, class_path: str):
 
     def init_model(self, ckpt_path: str, **kwargs) -> ResGatedGraphConvNetGraphPred:
         model = ResGatedGraphConvNetGraphPred.load_from_checkpoint(
-            ckpt_path, map_location=torch.device(self.device), criterion=None, strict=False,
-            metrics=dict(train=dict(), test=dict(), validation=dict()), pretrained_checkpoint=None,
-            config={"in_length": 256, "hidden_length": 512, "dropout_rate": 0.1, "n_conv_layers": 3,
-                    "n_linear_layers": 3, "n_atom_properties": 158, "n_bond_properties": 7,
-                    "n_molecule_properties": 200})
+            ckpt_path,
+            map_location=torch.device(self.device),
+            criterion=None,
+            strict=False,
+            metrics=dict(train=dict(), test=dict(), validation=dict()),
+            pretrained_checkpoint=None,
+            config={
+                "in_length": 256,
+                "hidden_length": 512,
+                "dropout_rate": 0.1,
+                "n_conv_layers": 3,
+                "n_linear_layers": 3,
+                "n_atom_properties": 158,
+                "n_bond_properties": 7,
+                "n_molecule_properties": 200,
+            },
+        )
         model.eval()
         return model
 
@@ -55,14 +70,21 @@ def read_smiles(self, smiles):
                 # use default value if we meet an unseen value
                 if isinstance(prop.encoder, IndexEncoder):
                     if str(value) in prop.encoder.cache:
-                        index = prop.encoder.cache.index(str(value)) + prop.encoder.offset
+                        index = (
+                            prop.encoder.cache.index(str(value)) + prop.encoder.offset
+                        )
                     else:
                         index = 0
-                        print(f"Unknown property value {value} for property {prop} at smiles {smiles}")
+                        print(
+                            f"Unknown property value {value} for property {prop} at smiles {smiles}"
+                        )
                     if isinstance(prop.encoder, OneHotEncoder):
-                        encoded_values.append(torch.nn.functional.one_hot(
-                            torch.tensor(index), num_classes=prop.encoder.get_encoding_length()
-                        ))
+                        encoded_values.append(
+                            torch.nn.functional.one_hot(
+                                torch.tensor(index),
+                                num_classes=prop.encoder.get_encoding_length(),
+                            )
+                        )
                     else:
                         encoded_values.append(torch.tensor([index]))
 
@@ -77,9 +99,7 @@ def read_smiles(self, smiles):
                 if len(encoded_values.size()) == 1:
                     encoded_values = encoded_values.unsqueeze(1)
             else:
-                encoded_values = torch.zeros(
-                    (0, prop.encoder.get_encoding_length())
-                )
+                encoded_values = torch.zeros((0, prop.encoder.get_encoding_length()))
             if isinstance(prop, p.AtomProperty):
                 x = torch.cat([x, encoded_values], dim=1)
             elif isinstance(prop, p.BondProperty):
@@ -93,4 +113,4 @@ def read_smiles(self, smiles):
             edge_attr=edge_attr,
             molecule_attr=molecule_attr,
         )
-        return d
+        return d
diff --git a/chebifier/prediction_models/nn_predictor.py b/chebifier/prediction_models/nn_predictor.py