ChEB-AI
diff --git a/api/__init__.py b/api/__init__.py
diff --git a/api/__main__.py b/api/__main__.py
Lines changed: 0 additions & 10 deletions
diff --git a/api/api_registry.yml b/api/api_registry.yml
Lines changed: 0 additions & 24 deletions
diff --git a/api/cli.py b/api/cli.py
Lines changed: 0 additions & 121 deletions
diff --git a/api/check_env.py b/chebifier/check_env.py
api/check_env.py renamed to chebifier/check_env.py
diff --git a/chebifier/cli.py b/chebifier/cli.py
Lines changed: 42 additions & 12 deletions
diff --git a/chebifier/ensemble.yml b/chebifier/ensemble.yml
Lines changed: 15 additions & 0 deletions
diff --git a/chebifier/ensemble/base_ensemble.py b/chebifier/ensemble/base_ensemble.py
Lines changed: 34 additions & 16 deletions
diff --git a/api/hugging_face.py b/chebifier/hugging_face.py
api/hugging_face.py renamed to chebifier/hugging_face.py
Lines changed: 5 additions & 6 deletions
@@ -1,3 +1,4 @@
+import importlib.resources
 import os
 
 import click
@@ -14,9 +15,10 @@ def cli():
 
 @cli.command()
 @click.option(
-    "--config_file",
+    "--ensemble-config",
+    "-e",
     type=click.Path(exists=True),
-    default=os.path.join("configs", "huggingface_config.yml"),
+    default=None,
     help="Configuration file for ensemble models",
 )
 @click.option("--smiles", "-s", multiple=True, help="SMILES strings to predict")
@@ -34,10 +36,10 @@ def cli():
 )
 @click.option(
     "--ensemble-type",
-    "-e",
+    "-t",
     type=click.Choice(ENSEMBLES.keys()),
-    default="mv",
-    help="Type of ensemble to use (default: Majority Voting)",
+    default="wmv-f1",
+    help="Type of ensemble to use (default: Weighted Majority Voting)",
 )
 @click.option(
     "--chebi-version",
@@ -53,25 +55,53 @@ def cli():
     default=True,
     help="Weight predictions based on how 'confident' a model is in its prediction (default: True)",
 )
+@click.option(
+    "--resolve-inconsistencies",
+    "-r",
+    is_flag=True,
+    default=True,
+    help="Resolve inconsistencies in predictions automatically (default: True)",
+)
 def predict(
-    config_file,
+    ensemble_config,
     smiles,
     smiles_file,
     output,
     ensemble_type,
     chebi_version,
     use_confidence,
+    resolve_inconsistencies=True,
 ):
     """Predict ChEBI classes for SMILES strings using an ensemble model.
-
-    CONFIG_FILE is the path to a YAML configuration file for the ensemble model.
-    """
+        """
     # Load configuration from YAML file
-    with open(config_file, "r") as f:
-        config = yaml.safe_load(f)
+    if not ensemble_config:
+        print(f"Using default ensemble configuration")
+        with importlib.resources.files("chebifier").joinpath("ensemble.yml").open("r") as f:
+            config = yaml.safe_load(f)
+    else:
+        print(f"Loading ensemble configuration from {ensemble_config}")
+        with open(ensemble_config, "r") as f:
+            config = yaml.safe_load(f)
+
+    with importlib.resources.files("chebifier").joinpath("model_registry.yml").open("r") as f:
+        model_registry = yaml.safe_load(f)
+
+    new_config = {}
+    for model_name, entry in config.items():
+        if "load_model" in entry:
+            if entry["load_model"] not in model_registry:
+                raise ValueError(
+                    f"Model {entry['load_model']} not found in model registry. "
+                    f"Available models are: {','.join(model_registry.keys())}."
+                )
+            new_config[model_name] = {**model_registry[entry["load_model"]], **entry}
+        else:
+            new_config[model_name] = entry
+    config = new_config
 
     # Instantiate ensemble model
-    ensemble = ENSEMBLES[ensemble_type](config, chebi_version=chebi_version)
+    ensemble = ENSEMBLES[ensemble_type](config, chebi_version=chebi_version, resolve_inconsistencies=resolve_inconsistencies)
 
     # Collect SMILES strings from arguments and/or file
     smiles_list = list(smiles)
 
@@ -0,0 +1,15 @@
+electra:
+    load_model: electra_chebi50_v241
+resgated:
+    load_model: resgated_chebi50_v241
+chemlog_peptides:
+    type: chemlog_peptides
+    model_weight: 100
+chemlog_element:
+    type: chemlog_element
+    model_weight: 100
+chemlog_organox:
+    type: chemlog_organox
+    model_weight: 100
+c3p:
+    load_model: c3p_with_weights
@@ -6,45 +6,63 @@
 from chebai.preprocessing.datasets.chebi import ChEBIOver50
 from chebai.result.analyse_sem import PredictionSmoother, get_chebi_graph
 
+from chebifier.check_env import check_package_installed
 from chebifier.prediction_models.base_predictor import BasePredictor
-from functools import lru_cache
+
 
 class BaseEnsemble:
 
-    def __init__(self, model_configs: dict, chebi_version: int = 241):
+    def __init__(self, model_configs: dict, chebi_version: int = 241, resolve_inconsistencies: bool = True):
         # Deferred Import: To avoid circular import error
         from chebifier.model_registry import MODEL_TYPES
 
         self.chebi_dataset = ChEBIOver50(chebi_version=chebi_version)
         self.chebi_dataset._download_required_data()  # download chebi if not already downloaded
         self.chebi_graph = get_chebi_graph(self.chebi_dataset, None)
-        self.disjoint_files = [
+        local_disjoint_files = [
             os.path.join("data", "disjoint_chebi.csv"),
             os.path.join("data", "disjoint_additional.csv"),
         ]
+        self.disjoint_files = []
+        for file in local_disjoint_files:
+            if os.path.isfile(file):
+                self.disjoint_files.append(file)
+            else:
+                print(f"Disjoint axiom file {file} not found. Loading from huggingface instead...")
+                from chebifier.hugging_face import download_model_files
+                self.disjoint_files.append(download_model_files({
+                        "repo_id": "chebai/chebifier",
+                        "repo_type": "dataset",
+                        "files": {"disjoint_file": os.path.basename(file)},
+                })["disjoint_file"])
 
         self.models = []
         self.positive_prediction_threshold = 0.5
         for model_name, model_config in model_configs.items():
             model_cls = MODEL_TYPES[model_config["type"]]
             if "hugging_face" in model_config:
-                from api.hugging_face import download_model_files
+                from chebifier.hugging_face import download_model_files
                 hugging_face_kwargs = download_model_files(model_config["hugging_face"])
             else:
                 hugging_face_kwargs = {}
+            if "package_name" in model_config:
+                check_package_installed(model_config["package_name"])
+
             model_instance = model_cls(
                 model_name, **model_config, **hugging_face_kwargs, chebi_graph=self.chebi_graph
             )
             assert isinstance(model_instance, BasePredictor)
             self.models.append(model_instance)
 
 
-
-        self.smoother = PredictionSmoother(
-            self.chebi_dataset,
-            label_names=None,
-            disjoint_files=self.disjoint_files,
-        )
+        if resolve_inconsistencies:
+            self.smoother = PredictionSmoother(
+                self.chebi_dataset,
+                label_names=None,
+                disjoint_files=self.disjoint_files,
+            )
+        else:
+            self.smoother = None
 
     def gather_predictions(self, smiles_list):
         # get predictions from all models for the SMILES list
@@ -131,15 +149,15 @@ def consolidate_predictions(self, predictions, classwise_weights, predicted_clas
         # Smooth predictions
         start_time = time.perf_counter()
         class_names = list(predicted_classes.keys())
-        self.smoother.set_label_names(class_names)
-        smooth_net_score = self.smoother(net_score)
+        if self.smoother is not None:
+            self.smoother.set_label_names(class_names)
+            smooth_net_score = self.smoother(net_score)
+            class_decisions = (smooth_net_score > 0.5) & has_valid_predictions  # Shape: (num_smiles, num_classes)
+        else:
+            class_decisions = (net_score > 0) & has_valid_predictions  # Shape: (num_smiles, num_classes)
         end_time = time.perf_counter()
         print(f"Prediction smoothing took {end_time - start_time:.2f} seconds")
 
-        class_decisions = (
-            smooth_net_score > 0.5
-        ) & has_valid_predictions # Shape: (num_smiles, num_classes)
-
         complete_failure = torch.all(~has_valid_predictions, dim=1)
         return class_decisions, complete_failure
 
 
@@ -25,27 +25,26 @@ def download_model_files(
         model_config (Dict[str, str | Dict[str, str]]): A dictionary containing:
             - 'repo_id' (str): The Hugging Face repository ID (e.g., 'username/modelname').
             - 'subfolder' (str): The subfolder within the repo where the files are located.
-            - 'files' (Dict[str, str]): A mapping from file type (e.g., 'ckpt', 'labels') to
+            - 'files' (Dict[str, str]): A mapping from file type (e.g., 'ckpt_path', 'target_labels_path') to
               actual file names (e.g., 'electra.ckpt', 'classes.txt').
 
     Returns:
         Dict[str, Path]: A dictionary mapping each file type to the local Path of the downloaded file.
     """
     repo_id = model_config["repo_id"]
-    subfolder = model_config["subfolder"]
+    subfolder = model_config.get("subfolder", None)
+    repo_type = model_config.get("repo_type", "model")
     filenames = model_config["files"]
 
     local_paths: dict[str, Path] = {}
     for file_type, filename in filenames.items():
         downloaded_file_path = hf_hub_download(
             repo_id=repo_id,
             filename=filename,
+            repo_type=repo_type,
             subfolder=subfolder,
         )
         local_paths[file_type] = Path(downloaded_file_path)
         print(f"\t Using file `{filename}` from: {downloaded_file_path}")
 
-    return {
-        "ckpt_path": local_paths["ckpt"],
-        "target_labels_path": local_paths["labels"],
-    }
+    return local_paths