
Commit 90aedd4

reformat with black
1 parent f8583cb commit 90aedd4

File tree

3 files changed: +148 -67 lines changed
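The diffs below are mechanical: black normalizes string quotes to double quotes, wraps calls that would exceed its default 88-character line limit, adds trailing commas to the wrapped forms, and normalizes blank lines. As a rough illustration (not part of this commit), the snippet below applies black programmatically to a made-up input via its format_str API; it assumes the black package is installed.

# Minimal sketch, not from this repository: show the kind of rewrite black applies.
# Assumes `black` is installed; the input string is an invented example.
import black

src = "if __name__ == '__main__':\n    cli()\n"

# black.Mode() defaults to an 88-character line length, the same limit that
# forces the long @click.option(...) calls in cli.py to be wrapped below.
formatted = black.format_str(src, mode=black.Mode())
print(formatted)  # single quotes become double quotes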

chebifier/__main__.py

Lines changed: 2 additions & 2 deletions
@@ -1,4 +1,4 @@
 from chebifier.cli import cli
 
-if __name__ == '__main__':
-    cli()
+if __name__ == "__main__":
+    cli()

chebifier/cli.py

Lines changed: 68 additions & 18 deletions
@@ -5,49 +5,99 @@
 
 from .model_registry import ENSEMBLES
 
+
 @click.group()
 def cli():
     """Command line interface for Chebifier."""
     pass
 
+
 @cli.command()
-@click.option('--config_file', type=click.Path(exists=True), default=os.path.join('configs', 'huggingface_config.yml'), help="Configuration file for ensemble models")
-@click.option('--smiles', '-s', multiple=True, help='SMILES strings to predict')
-@click.option('--smiles-file', '-f', type=click.Path(exists=True), help='File containing SMILES strings (one per line)')
-@click.option('--output', '-o', type=click.Path(), help='Output file to save predictions (optional)')
-@click.option('--ensemble-type', '-e', type=click.Choice(ENSEMBLES.keys()), default='mv', help='Type of ensemble to use (default: Majority Voting)')
-@click.option("--chebi-version", "-v", type=int, default=241, help="ChEBI version to use for checking consistency (default: 241)")
-@click.option("--use-confidence", "-c", is_flag=True, default=True, help="Weight predictions based on how 'confident' a model is in its prediction (default: True)")
-def predict(config_file, smiles, smiles_file, output, ensemble_type, chebi_version, use_confidence):
+@click.option(
+    "--config_file",
+    type=click.Path(exists=True),
+    default=os.path.join("configs", "huggingface_config.yml"),
+    help="Configuration file for ensemble models",
+)
+@click.option("--smiles", "-s", multiple=True, help="SMILES strings to predict")
+@click.option(
+    "--smiles-file",
+    "-f",
+    type=click.Path(exists=True),
+    help="File containing SMILES strings (one per line)",
+)
+@click.option(
+    "--output",
+    "-o",
+    type=click.Path(),
+    help="Output file to save predictions (optional)",
+)
+@click.option(
+    "--ensemble-type",
+    "-e",
+    type=click.Choice(ENSEMBLES.keys()),
+    default="mv",
+    help="Type of ensemble to use (default: Majority Voting)",
+)
+@click.option(
+    "--chebi-version",
+    "-v",
+    type=int,
+    default=241,
+    help="ChEBI version to use for checking consistency (default: 241)",
+)
+@click.option(
+    "--use-confidence",
+    "-c",
+    is_flag=True,
+    default=True,
+    help="Weight predictions based on how 'confident' a model is in its prediction (default: True)",
+)
+def predict(
+    config_file,
+    smiles,
+    smiles_file,
+    output,
+    ensemble_type,
+    chebi_version,
+    use_confidence,
+):
     """Predict ChEBI classes for SMILES strings using an ensemble model.
-
+
     CONFIG_FILE is the path to a YAML configuration file for the ensemble model.
     """
     # Load configuration from YAML file
-    with open(config_file, 'r') as f:
+    with open(config_file, "r") as f:
         config = yaml.safe_load(f)
-
+
     # Instantiate ensemble model
     ensemble = ENSEMBLES[ensemble_type](config, chebi_version=chebi_version)
-
+
     # Collect SMILES strings from arguments and/or file
     smiles_list = list(smiles)
     if smiles_file:
-        with open(smiles_file, 'r') as f:
+        with open(smiles_file, "r") as f:
             smiles_list.extend([line.strip() for line in f if line.strip()])
-
+
     if not smiles_list:
         click.echo("No SMILES strings provided. Use --smiles or --smiles-file options.")
         return
 
     # Make predictions
-    predictions = ensemble.predict_smiles_list(smiles_list, use_confidence=use_confidence)
+    predictions = ensemble.predict_smiles_list(
+        smiles_list, use_confidence=use_confidence
+    )
 
     if output:
         # save as json
         import json
-        with open(output, 'w') as f:
-            json.dump({smiles: pred for smiles, pred in zip(smiles_list, predictions)}, f, indent=2)
+
+        with open(output, "w") as f:
+            json.dump(
+                {smiles: pred for smiles, pred in zip(smiles_list, predictions)},
+                f,
+                indent=2,
+            )
 
     else:
         # Print results
@@ -59,5 +109,5 @@ def predict(config_file, smiles, smiles_file, output, ensemble_type, chebi_versi
             click.echo(" No predictions")
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     cli()
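For context on the options reformatted above, the predict command can be driven from Python through click's testing utilities. The sketch below is illustrative only: it assumes the chebifier package and the models referenced by the default configs/huggingface_config.yml are installed and available, and the SMILES string is just an example.

# Minimal sketch, not part of this commit: invoke the reformatted `predict`
# command via click's CliRunner. Assumes chebifier and its model dependencies
# are installed and the default config file exists.
from click.testing import CliRunner

from chebifier.cli import cli

runner = CliRunner()
result = runner.invoke(
    cli,
    [
        "predict",
        "--smiles",
        "CC(=O)Oc1ccccc1C(=O)O",  # example SMILES (aspirin)
        "--ensemble-type",
        "mv",
    ],
)
print(result.exit_code, result.output)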

chebifier/ensemble/base_ensemble.py

Lines changed: 78 additions & 47 deletions
@@ -22,18 +22,19 @@ def __init__(self, model_configs: dict, chebi_version: int = 241):
                 hugging_face_kwargs = download_model_files(model_config["hugging_face"])
             else:
                 hugging_face_kwargs = {}
-            model_instance = model_cls(model_name, **model_config, **hugging_face_kwargs)
+            model_instance = model_cls(
+                model_name, **model_config, **hugging_face_kwargs
+            )
             assert isinstance(model_instance, BasePredictor)
             self.models.append(model_instance)
 
         self.chebi_dataset = ChEBIOver50(chebi_version=chebi_version)
         self.chebi_dataset._download_required_data()  # download chebi if not already downloaded
-        self.disjoint_files=[
+        self.disjoint_files = [
             os.path.join("data", "disjoint_chebi.csv"),
-            os.path.join("data", "disjoint_additional.csv")
+            os.path.join("data", "disjoint_additional.csv"),
         ]
 
-
     def gather_predictions(self, smiles_list):
         # get predictions from all models for the SMILES list
         # order them by alphabetically by label class
@@ -60,11 +61,12 @@ def gather_predictions(self, smiles_list):
             ):
                 if logits_for_smiles is not None:
                     for cls in logits_for_smiles:
-                        ordered_logits[j, predicted_classes_dict[cls], i] = logits_for_smiles[cls]
+                        ordered_logits[j, predicted_classes_dict[cls], i] = (
+                            logits_for_smiles[cls]
+                        )
 
         return ordered_logits, predicted_classes
 
-
     def consolidate_predictions(self, predictions, classwise_weights, **kwargs):
         """
         Aggregates predictions from multiple models using weighted majority voting.
@@ -80,11 +82,17 @@ def consolidate_predictions(self, predictions, classwise_weights, **kwargs):
         has_valid_predictions = valid_counts > 0
 
         # Calculate positive and negative predictions for all classes at once
-        positive_mask = (predictions > self.positive_prediction_threshold) & valid_predictions
-        negative_mask = (predictions < self.positive_prediction_threshold) & valid_predictions
+        positive_mask = (
+            predictions > self.positive_prediction_threshold
+        ) & valid_predictions
+        negative_mask = (
+            predictions < self.positive_prediction_threshold
+        ) & valid_predictions
 
         if "use_confidence" in kwargs and kwargs["use_confidence"]:
-            confidence = 2 * torch.abs(predictions.nan_to_num() - self.positive_prediction_threshold)
+            confidence = 2 * torch.abs(
+                predictions.nan_to_num() - self.positive_prediction_threshold
+            )
         else:
             confidence = torch.ones_like(predictions)
 
@@ -95,18 +103,22 @@ def consolidate_predictions(self, predictions, classwise_weights, **kwargs):
         # Calculate weighted predictions using broadcasting
         # predictions shape: (num_smiles, num_classes, num_models)
         # weights shape: (num_classes, num_models)
-        positive_weighted = positive_mask.float() * confidence * pos_weights.unsqueeze(0)
-        negative_weighted = negative_mask.float() * confidence * neg_weights.unsqueeze(0)
+        positive_weighted = (
+            positive_mask.float() * confidence * pos_weights.unsqueeze(0)
+        )
+        negative_weighted = (
+            negative_mask.float() * confidence * neg_weights.unsqueeze(0)
+        )
 
         # Sum over models dimension
         positive_sum = positive_weighted.sum(dim=2)  # Shape: (num_smiles, num_classes)
         negative_sum = negative_weighted.sum(dim=2)  # Shape: (num_smiles, num_classes)
 
         # Determine which classes to include for each SMILES
         net_score = positive_sum - negative_sum  # Shape: (num_smiles, num_classes)
-        class_decisions = (net_score > 0) & has_valid_predictions  # Shape: (num_smiles, num_classes)
-
-
+        class_decisions = (
+            net_score > 0
+        ) & has_valid_predictions  # Shape: (num_smiles, num_classes)
 
         return class_decisions
 
@@ -117,29 +129,43 @@ def calculate_classwise_weights(self, predicted_classes):
 
         return positive_weights, negative_weights
 
-    def predict_smiles_list(self, smiles_list, load_preds_if_possible=True, **kwargs) -> list:
+    def predict_smiles_list(
+        self, smiles_list, load_preds_if_possible=True, **kwargs
+    ) -> list:
         preds_file = f"predictions_by_model_{'_'.join(model.model_name for model in self.models)}.pt"
         predicted_classes_file = f"predicted_classes_{'_'.join(model.model_name for model in self.models)}.txt"
         if not load_preds_if_possible or not os.path.isfile(preds_file):
-            ordered_predictions, predicted_classes = self.gather_predictions(smiles_list)
+            ordered_predictions, predicted_classes = self.gather_predictions(
+                smiles_list
+            )
             # save predictions
             torch.save(ordered_predictions, preds_file)
             with open(predicted_classes_file, "w") as f:
                 for cls in predicted_classes:
                     f.write(f"{cls}\n")
             predicted_classes = {cls: i for i, cls in enumerate(predicted_classes)}
         else:
-            print(f"Loading predictions from {preds_file} and label indexes from {predicted_classes_file}")
+            print(
+                f"Loading predictions from {preds_file} and label indexes from {predicted_classes_file}"
+            )
             ordered_predictions = torch.load(preds_file)
             with open(predicted_classes_file, "r") as f:
-                predicted_classes = {line.strip(): i for i, line in enumerate(f.readlines())}
+                predicted_classes = {
+                    line.strip(): i for i, line in enumerate(f.readlines())
+                }
 
         classwise_weights = self.calculate_classwise_weights(predicted_classes)
-        class_decisions = self.consolidate_predictions(ordered_predictions, classwise_weights, **kwargs)
+        class_decisions = self.consolidate_predictions(
+            ordered_predictions, classwise_weights, **kwargs
+        )
         # Smooth predictions
         class_names = list(predicted_classes.keys())
         # initialise new smoother class since we don't know the labels beforehand (this could be more efficient)
-        new_smoother = PredictionSmoother(self.chebi_dataset, label_names=class_names, disjoint_files=self.disjoint_files)
+        new_smoother = PredictionSmoother(
+            self.chebi_dataset,
+            label_names=class_names,
+            disjoint_files=self.disjoint_files,
+        )
         class_decisions = new_smoother(class_decisions)
 
         class_names = list(predicted_classes.keys())
@@ -153,31 +179,36 @@ def predict_smiles_list(self, smiles_list, load_preds_if_possible=True, **kwargs
 
 
 if __name__ == "__main__":
-    ensemble = BaseEnsemble({"resgated_0ps1g189":{
-        "type": "resgated",
-        "ckpt_path": "data/0ps1g189/epoch=122.ckpt",
-        "target_labels_path": "data/chebi_v241/ChEBI50/processed/classes.txt",
-        "molecular_properties": [
-            "chebai_graph.preprocessing.properties.AtomType",
-            "chebai_graph.preprocessing.properties.NumAtomBonds",
-            "chebai_graph.preprocessing.properties.AtomCharge",
-            "chebai_graph.preprocessing.properties.AtomAromaticity",
-            "chebai_graph.preprocessing.properties.AtomHybridization",
-            "chebai_graph.preprocessing.properties.AtomNumHs",
-            "chebai_graph.preprocessing.properties.BondType",
-            "chebai_graph.preprocessing.properties.BondInRing",
-            "chebai_graph.preprocessing.properties.BondAromaticity",
-            "chebai_graph.preprocessing.properties.RDKit2DNormalized",
-        ],
-        #"classwise_weights_path" : "../python-chebai/metrics_0ps1g189_80-10-10.json"
-    },
-
-        "electra_14ko0zcf": {
-            "type": "electra",
-            "ckpt_path": "data/14ko0zcf/epoch=193.ckpt",
-            "target_labels_path": "data/chebi_v241/ChEBI50/processed/classes.txt",
-            #"classwise_weights_path": "../python-chebai/metrics_electra_14ko0zcf_80-10-10.json",
-        }
-    })
-    r = ensemble.predict_smiles_list(["[NH3+]CCCC[C@H](NC(=O)[C@@H]([NH3+])CC([O-])=O)C([O-])=O"], load_preds_if_possible=False)
+    ensemble = BaseEnsemble(
+        {
+            "resgated_0ps1g189": {
+                "type": "resgated",
+                "ckpt_path": "data/0ps1g189/epoch=122.ckpt",
+                "target_labels_path": "data/chebi_v241/ChEBI50/processed/classes.txt",
+                "molecular_properties": [
+                    "chebai_graph.preprocessing.properties.AtomType",
+                    "chebai_graph.preprocessing.properties.NumAtomBonds",
+                    "chebai_graph.preprocessing.properties.AtomCharge",
+                    "chebai_graph.preprocessing.properties.AtomAromaticity",
+                    "chebai_graph.preprocessing.properties.AtomHybridization",
+                    "chebai_graph.preprocessing.properties.AtomNumHs",
+                    "chebai_graph.preprocessing.properties.BondType",
+                    "chebai_graph.preprocessing.properties.BondInRing",
+                    "chebai_graph.preprocessing.properties.BondAromaticity",
+                    "chebai_graph.preprocessing.properties.RDKit2DNormalized",
+                ],
+                # "classwise_weights_path" : "../python-chebai/metrics_0ps1g189_80-10-10.json"
+            },
+            "electra_14ko0zcf": {
+                "type": "electra",
+                "ckpt_path": "data/14ko0zcf/epoch=193.ckpt",
+                "target_labels_path": "data/chebi_v241/ChEBI50/processed/classes.txt",
+                # "classwise_weights_path": "../python-chebai/metrics_electra_14ko0zcf_80-10-10.json",
+            },
+        }
+    )
+    r = ensemble.predict_smiles_list(
+        ["[NH3+]CCCC[C@H](NC(=O)[C@@H]([NH3+])CC([O-])=O)C([O-])=O"],
+        load_preds_if_possible=False,
+    )
     print(len(r), r[0])
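Functionally, nothing above changes: consolidate_predictions still lets each model vote for or against a class depending on whether its prediction lies above or below positive_prediction_threshold, optionally scales the votes by confidence and per-class weights, and keeps a class when the net score is positive. The standalone sketch below replays that voting on toy tensors so the broadcasting shapes in the comments are easy to follow; the 0.5 threshold, uniform weights, and all numbers are assumptions for illustration, not values taken from the repository.

# Standalone sketch of the weighted-voting logic reformatted above, on toy data.
# Shapes follow the comments in base_ensemble.py: predictions are
# (num_smiles, num_classes, num_models), weights are (num_classes, num_models).
# The 0.5 threshold and every number here are illustrative assumptions.
import torch

threshold = 0.5
predictions = torch.tensor(
    # 1 SMILES, 2 classes, 3 models; NaN marks a class a model did not predict
    [[[0.9, 0.2, 0.7], [0.1, float("nan"), 0.6]]]
)
pos_weights = torch.ones(2, 3)  # (num_classes, num_models)
neg_weights = torch.ones(2, 3)

valid = ~torch.isnan(predictions)
positive_mask = (predictions > threshold) & valid
negative_mask = (predictions < threshold) & valid

# Confidence: distance from the threshold, rescaled to [0, 1]
confidence = 2 * torch.abs(predictions.nan_to_num() - threshold)

positive_sum = (positive_mask.float() * confidence * pos_weights.unsqueeze(0)).sum(dim=2)
negative_sum = (negative_mask.float() * confidence * neg_weights.unsqueeze(0)).sum(dim=2)

net_score = positive_sum - negative_sum
class_decisions = (net_score > 0) & (valid.sum(dim=2) > 0)
print(class_decisions)  # tensor([[ True, False]])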
