add huggingface download to cli

sfluegel05 · sfluegel05 · commit f8583cbdfa00 · 2025-07-11T13:08:03.000+02:00
diff --git a/chebifier/__main__.py b/chebifier/__main__.py
@@ -0,0 +1,4 @@
+from chebifier.cli import cli
+
+if __name__ == '__main__':
+    cli()
diff --git a/chebifier/cli.py b/chebifier/cli.py
@@ -1,3 +1,5 @@
+import os
+
 import click
 import yaml
 
@@ -9,14 +11,14 @@ def cli():
     pass
 
 @cli.command()
-@click.argument('config_file', type=click.Path(exists=True))
+@click.option('--config_file', type=click.Path(exists=True), default=os.path.join('configs', 'huggingface_config.yml'), help="Configuration file for ensemble models")
 @click.option('--smiles', '-s', multiple=True, help='SMILES strings to predict')
 @click.option('--smiles-file', '-f', type=click.Path(exists=True), help='File containing SMILES strings (one per line)')
 @click.option('--output', '-o', type=click.Path(), help='Output file to save predictions (optional)')
 @click.option('--ensemble-type', '-e', type=click.Choice(ENSEMBLES.keys()), default='mv', help='Type of ensemble to use (default: Majority Voting)')
 @click.option("--chebi-version", "-v", type=int, default=241, help="ChEBI version to use for checking consistency (default: 241)")
 @click.option("--use-confidence", "-c", is_flag=True, default=True, help="Weight predictions based on how 'confident' a model is in its prediction (default: True)")
-def predict(config_file, smiles, smiles_file, output, ensemble_type, chebi_version):
+def predict(config_file, smiles, smiles_file, output, ensemble_type, chebi_version, use_confidence):
     """Predict ChEBI classes for SMILES strings using an ensemble model.
     
     CONFIG_FILE is the path to a YAML configuration file for the ensemble model.
@@ -39,7 +41,7 @@ def predict(config_file, smiles, smiles_file, output, ensemble_type, chebi_versi
         return
 
     # Make predictions
-    predictions = ensemble.predict_smiles_list(smiles_list)
+    predictions = ensemble.predict_smiles_list(smiles_list, use_confidence=use_confidence)
 
     if output:
         # save as json
diff --git a/chebifier/ensemble/base_ensemble.py b/chebifier/ensemble/base_ensemble.py
@@ -4,6 +4,7 @@
 from chebai.preprocessing.datasets.chebi import ChEBIOver50
 from chebai.result.analyse_sem import PredictionSmoother
 
+from api.hugging_face import download_model_files
 from chebifier.prediction_models.base_predictor import BasePredictor
 
 
@@ -17,14 +18,20 @@ def __init__(self, model_configs: dict, chebi_version: int = 241):
         self.positive_prediction_threshold = 0.5
         for model_name, model_config in model_configs.items():
             model_cls = MODEL_TYPES[model_config["type"]]
-            model_instance = model_cls(model_name, **model_config)
+            if "hugging_face" in model_config:
+                hugging_face_kwargs = download_model_files(model_config["hugging_face"])
+            else:
+                hugging_face_kwargs = {}
+            model_instance = model_cls(model_name, **model_config, **hugging_face_kwargs)
             assert isinstance(model_instance, BasePredictor)
             self.models.append(model_instance)
 
-        self.smoother = PredictionSmoother(ChEBIOver50(chebi_version=chebi_version), disjoint_files=[
+        self.chebi_dataset = ChEBIOver50(chebi_version=chebi_version)
+        self.chebi_dataset._download_required_data()  # download chebi if not already downloaded
+        self.disjoint_files=[
             os.path.join("data", "disjoint_chebi.csv"),
             os.path.join("data", "disjoint_additional.csv")
-        ])
+        ]
 
 
     def gather_predictions(self, smiles_list):
@@ -110,7 +117,7 @@ def calculate_classwise_weights(self, predicted_classes):
 
         return positive_weights, negative_weights
 
-    def predict_smiles_list(self, smiles_list, load_preds_if_possible=True) -> list:
+    def predict_smiles_list(self, smiles_list, load_preds_if_possible=True, **kwargs) -> list:
         preds_file = f"predictions_by_model_{'_'.join(model.model_name for model in self.models)}.pt"
         predicted_classes_file = f"predicted_classes_{'_'.join(model.model_name for model in self.models)}.txt"
         if not load_preds_if_possible or not os.path.isfile(preds_file):
@@ -128,11 +135,12 @@ def predict_smiles_list(self, smiles_list, load_preds_if_possible=True) -> list:
                 predicted_classes = {line.strip(): i for i, line in enumerate(f.readlines())}
 
         classwise_weights = self.calculate_classwise_weights(predicted_classes)
-        class_decisions = self.consolidate_predictions(ordered_predictions, classwise_weights)
+        class_decisions = self.consolidate_predictions(ordered_predictions, classwise_weights, **kwargs)
         # Smooth predictions
         class_names = list(predicted_classes.keys())
-        self.smoother.label_names = class_names
-        class_decisions = self.smoother(class_decisions)
+        # initialise new smoother class since we don't know the labels beforehand (this could be more efficient)
+        new_smoother = PredictionSmoother(self.chebi_dataset, label_names=class_names, disjoint_files=self.disjoint_files)
+        class_decisions = new_smoother(class_decisions)
 
         class_names = list(predicted_classes.keys())
         class_indices = {predicted_classes[cls]: cls for cls in class_names}
diff --git a/chebifier/model_registry.py b/chebifier/model_registry.py
@@ -10,9 +10,9 @@
 )
 
 ENSEMBLES = {
-    "en_mv": BaseEnsemble,
-    "en_wmv-ppvnpv": WMVwithPPVNPVEnsemble,
-    "en_wmv-f1": WMVwithF1Ensemble,
+    "mv": BaseEnsemble,
+    "wmv-ppvnpv": WMVwithPPVNPVEnsemble,
+    "wmv-f1": WMVwithF1Ensemble,
 }
 
 
diff --git a/configs/huggingface_config.yml b/configs/huggingface_config.yml
@@ -0,0 +1,22 @@
+
+chemlog_peptides:
+    type: chemlog
+    model_weight: 100
+
+#resgated_huggingface:
+#  type: resgated
+#  hugging_face:
+#    repo_id: aditya0by0/python-chebifier
+#    subfolder: resgated
+#    files:
+#      ckpt: resgated.ckpt
+#      labels: classes.txt
+
+electra_huggingface:
+  type: electra
+  hugging_face:
+    repo_id: aditya0by0/python-chebifier
+    subfolder: electra
+    files:
+      ckpt: electra.ckpt
+      labels: classes.txt
diff --git a/pyproject.toml b/pyproject.toml
@@ -27,9 +27,6 @@ dependencies = [
     "chemlog>=1.0.4"
 ]
 
-[project.scripts]
-chebifier = "chebifier.cli:cli"
-
 
 [tool.setuptools]
 packages = ["chebifier", "chebifier.ensemble", "chebifier.prediction_models"]

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,4 @@`
	`1`	`+from chebifier.cli import cli`
	`2`	`+`
	`3`	`+if __name__ == '__main__':`
	`4`	`+    cli()`
Original file line number	Diff line number	Diff line change
`@@ -10,9 +10,9 @@`
`10`	`10`	`)`
`11`	`11`
`12`	`12`	`ENSEMBLES = {`
`13`		`- "en_mv": BaseEnsemble,`
`14`		`- "en_wmv-ppvnpv": WMVwithPPVNPVEnsemble,`
`15`		`- "en_wmv-f1": WMVwithF1Ensemble,`
	`13`	`+ "mv": BaseEnsemble,`
	`14`	`+ "wmv-ppvnpv": WMVwithPPVNPVEnsemble,`
	`15`	`+ "wmv-f1": WMVwithF1Ensemble,`
`16`	`16`	`}`
`17`	`17`
`18`	`18`
Original file line number	Diff line number	Diff line change
`@@ -27,9 +27,6 @@ dependencies = [`
`27`	`27`	`"chemlog>=1.0.4"`
`28`	`28`	`]`
`29`	`29`
`30`		`-[project.scripts]`
`31`		`-chebifier = "chebifier.cli:cli"`
`32`		`-`
`33`	`30`
`34`	`31`	`[tool.setuptools]`
`35`	`32`	`packages = ["chebifier", "chebifier.ensemble", "chebifier.prediction_models"]`