Use class scores (rather than binary class decisions) for smoothing; explicitly predict the transitive closure for all models

sfluegel05 · sfluegel05 · commit e8e4ec35a1c9 · 2025-07-18T19:25:01.000+02:00
diff --git a/chebifier/ensemble/base_ensemble.py b/chebifier/ensemble/base_ensemble.py
@@ -4,7 +4,7 @@
 import torch
 import tqdm
 from chebai.preprocessing.datasets.chebi import ChEBIOver50
-from chebai.result.analyse_sem import PredictionSmoother
+from chebai.result.analyse_sem import PredictionSmoother, get_chebi_graph
 
 from chebifier.prediction_models.base_predictor import BasePredictor
 
@@ -15,6 +15,14 @@ def __init__(self, model_configs: dict, chebi_version: int = 241):
         # Deferred Import: To avoid circular import error
         from chebifier.model_registry import MODEL_TYPES
 
+        self.chebi_dataset = ChEBIOver50(chebi_version=chebi_version)
+        self.chebi_dataset._download_required_data()  # download chebi if not already downloaded
+        self.chebi_graph = get_chebi_graph(self.chebi_dataset, None)
+        self.disjoint_files = [
+            os.path.join("data", "disjoint_chebi.csv"),
+            os.path.join("data", "disjoint_additional.csv"),
+        ]
+
         self.models = []
         self.positive_prediction_threshold = 0.5
         for model_name, model_config in model_configs.items():
@@ -25,17 +33,12 @@ def __init__(self, model_configs: dict, chebi_version: int = 241):
             else:
                 hugging_face_kwargs = {}
             model_instance = model_cls(
-                model_name, **model_config, **hugging_face_kwargs
+                model_name, **model_config, **hugging_face_kwargs, chebi_graph=self.chebi_graph
             )
             assert isinstance(model_instance, BasePredictor)
             self.models.append(model_instance)
 
-        self.chebi_dataset = ChEBIOver50(chebi_version=chebi_version)
-        self.chebi_dataset._download_required_data()  # download chebi if not already downloaded
-        self.disjoint_files = [
-            os.path.join("data", "disjoint_chebi.csv"),
-            os.path.join("data", "disjoint_additional.csv"),
-        ]
+
 
         self.smoother = PredictionSmoother(
             self.chebi_dataset,
@@ -54,7 +57,7 @@ def gather_predictions(self, smiles_list):
                 if logits_for_smiles is not None:
                     for cls in logits_for_smiles:
                         predicted_classes.add(cls)
-        print("Sorting predictions...")
+        print(f"Sorting predictions from {len(model_predictions)} models...")
         predicted_classes = sorted(list(predicted_classes))
         predicted_classes_dict = {cls: i for i, cls in enumerate(predicted_classes)}
         ordered_logits = (
@@ -75,7 +78,7 @@ def gather_predictions(self, smiles_list):
 
         return ordered_logits, predicted_classes
 
-    def consolidate_predictions(self, predictions, classwise_weights, **kwargs):
+    def consolidate_predictions(self, predictions, classwise_weights, predicted_classes, **kwargs):
         """
         Aggregates predictions from multiple models using weighted majority voting.
         Optimized version using tensor operations instead of for loops.
@@ -124,8 +127,17 @@ def consolidate_predictions(self, predictions, classwise_weights, **kwargs):
 
         # Determine which classes to include for each SMILES
         net_score = positive_sum - negative_sum  # Shape: (num_smiles, num_classes)
+
+        # Smooth predictions
+        start_time = time.perf_counter()
+        class_names = list(predicted_classes.keys())
+        self.smoother.set_label_names(class_names)
+        smooth_net_score = self.smoother(net_score)
+        end_time = time.perf_counter()
+        print(f"Prediction smoothing took {end_time - start_time:.2f} seconds")
+
         class_decisions = (
-            net_score > 0
+            smooth_net_score > 0.5
         ) & has_valid_predictions # Shape: (num_smiles, num_classes)
 
         complete_failure = torch.all(~has_valid_predictions, dim=1)
@@ -139,14 +151,16 @@ def calculate_classwise_weights(self, predicted_classes):
         return positive_weights, negative_weights
 
     def predict_smiles_list(
-        self, smiles_list, load_preds_if_possible=True, **kwargs
+        self, smiles_list, load_preds_if_possible=False, **kwargs
     ) -> list:
         preds_file = f"predictions_by_model_{'_'.join(model.model_name for model in self.models)}.pt"
         predicted_classes_file = f"predicted_classes_{'_'.join(model.model_name for model in self.models)}.txt"
         if not load_preds_if_possible or not os.path.isfile(preds_file):
             ordered_predictions, predicted_classes = self.gather_predictions(
                 smiles_list
             )
+            if len(predicted_classes) == 0:
+                print(f"Warning: No classes have been predicted for the given SMILES list.")
             # save predictions
             torch.save(ordered_predictions, preds_file)
             with open(predicted_classes_file, "w") as f:
@@ -165,15 +179,8 @@ def predict_smiles_list(
 
         classwise_weights = self.calculate_classwise_weights(predicted_classes)
         class_decisions, is_failure = self.consolidate_predictions(
-            ordered_predictions, classwise_weights, **kwargs
+            ordered_predictions, classwise_weights, predicted_classes, **kwargs
         )
-        # Smooth predictions
-        start_time = time.perf_counter()
-        class_names = list(predicted_classes.keys())
-        self.smoother.set_label_names(class_names)
-        class_decisions = self.smoother(class_decisions)
-        end_time = time.perf_counter()
-        print(f"Prediction smoothing took {end_time - start_time:.2f} seconds")
 
         class_names = list(predicted_classes.keys())
         class_indices = {predicted_classes[cls]: cls for cls in class_names}
diff --git a/chebifier/prediction_models/c3p_predictor.py b/chebifier/prediction_models/c3p_predictor.py
@@ -15,13 +15,15 @@ def __init__(self, model_name: str, program_directory: Optional[Path]=None, chem
         super().__init__(model_name, **kwargs)
         self.program_directory = program_directory
         self.chemical_classes = chemical_classes
+        self.chebi_graph = kwargs.get("chebi_graph", None)
 
     def predict_smiles_list(self, smiles_list: list[str]) -> list:
-        result_list = c3p_classifier.classify(smiles_list, self.program_directory, self.chemical_classes, strict=False)
+        result_list = c3p_classifier.classify(smiles_list, self.program_directory, self.chemical_classes, strict=True)
         result_reformatted = [dict() for _ in range(len(smiles_list))]
         for result in result_list:
-            result_reformatted[smiles_list.index(result.input_smiles)][result.class_id.split(":")[1]] = result.is_match
-        print(f"C3P predictions for {len(smiles_list)} SMILES strings:")
-        for i, smiles in enumerate(smiles_list):
-            print(f"{smiles}: {result_reformatted[i]}")
+            chebi_id = result.class_id.split(":")[1]
+            result_reformatted[smiles_list.index(result.input_smiles)][chebi_id] = result.is_match
+            if result.is_match and self.chebi_graph is not None:
+                for parent in list(self.chebi_graph.predecessors(int(chebi_id))):
+                    result_reformatted[smiles_list.index(result.input_smiles)][str(parent)] = 1
         return result_reformatted
diff --git a/chebifier/prediction_models/chebi_lookup.py b/chebifier/prediction_models/chebi_lookup.py
@@ -10,6 +10,14 @@ def __init__(self, model_name: str, description: str = None, chebi_version: int
         super().__init__(model_name, **kwargs)
         self._description = description or "ChEBI Lookup: If the SMILES is equivalent to a ChEBI entry, retrieve the classification of that entry."
         self.chebi_version = chebi_version
+        self.chebi_graph = kwargs.get("chebi_graph", None)
+        if self.chebi_graph is None:
+            from chebai.preprocessing.datasets.chebi import ChEBIOver50
+            self.chebi_dataset = ChEBIOver50(chebi_version=self.chebi_version)
+            self.chebi_dataset._download_required_data()
+            self.chebi_graph = self.chebi_dataset._extract_class_hierarchy(
+                os.path.join(self.chebi_dataset.raw_dir, "chebi.obo")
+            )
         self.lookup_table = self.get_smiles_lookup()
 
     def get_smiles_lookup(self):
@@ -26,15 +34,8 @@ def get_smiles_lookup(self):
 
 
     def build_smiles_lookup(self):
-        # todo test
-        from chebai.preprocessing.datasets.chebi import ChEBIOver50
-        self.chebi_dataset = ChEBIOver50(chebi_version=self.chebi_version)
-        self.chebi_dataset._download_required_data()
-        chebi_graph = self.chebi_dataset._extract_class_hierarchy(
-            os.path.join(self.chebi_dataset.raw_dir, "chebi.obo")
-        )
         smiles_lookup = dict()
-        for chebi_id, smiles in nx.get_node_attributes(chebi_graph, "smiles").items():
+        for chebi_id, smiles in nx.get_node_attributes(self.chebi_graph, "smiles").items():
             if smiles is not None:
                 try:
                     mol = Chem.MolFromSmiles(smiles)
@@ -45,7 +46,7 @@ def build_smiles_lookup(self):
                     if canonical_smiles not in smiles_lookup:
                         smiles_lookup[canonical_smiles] = []
                     # if the canonical SMILES is already in the lookup, append "different interpretation of the SMILES"
-                    smiles_lookup[canonical_smiles].append((chebi_id, list(chebi_graph.predecessors(chebi_id))))
+                    smiles_lookup[canonical_smiles].append((chebi_id, list(self.chebi_graph.predecessors(chebi_id))))
                 except Exception as e:
                     print(f"Failed to parse SMILES {smiles} for ChEBI ID {chebi_id}: {e}")
         return smiles_lookup
diff --git a/chebifier/prediction_models/chemlog_predictor.py b/chebifier/prediction_models/chemlog_predictor.py
@@ -45,11 +45,24 @@ class ChemlogExtraPredictor(BasePredictor):
 
     def __init__(self, model_name: str, **kwargs):
         super().__init__(model_name, **kwargs)
+        self.chebi_graph = kwargs.get("chebi_graph", None)
         self.classifier = self.CHEMLOG_CLASSIFIER()
 
     def predict_smiles_list(self, smiles_list: list[str]) -> list:
         mol_list = [_smiles_to_mol(smiles) for smiles in smiles_list]
-        return self.classifier.classify(mol_list)
+        res = self.classifier.classify(mol_list)
+        if self.chebi_graph is not None:
+            for sample in res:
+                sample_additions = dict()
+                for cls in sample:
+                    if sample[cls] == 1:
+                        successors = list(self.chebi_graph.predecessors(int(cls)))
+                        if successors:
+                            for succ in successors:
+                                sample_additions[str(succ)] = 1
+                sample.update(sample_additions)
+        return res
+
 
 class ChemlogXMolecularEntityPredictor(ChemlogExtraPredictor):
 
@@ -63,6 +76,7 @@ class ChemlogPeptidesPredictor(BasePredictor):
     def __init__(self, model_name: str, **kwargs):
         super().__init__(model_name, **kwargs)
         self.strategy = "algo"
+        self.chebi_graph = kwargs.get("chebi_graph", None)
         self.classifier_instances = {
             k: v() for k, v in CLASSIFIERS[self.strategy].items()
         }
@@ -81,17 +95,21 @@ def predict_smiles_list(self, smiles_list: list[str]) -> list:
             if mol is None:
                 results.append(None)
             else:
+                pos_labels = [label for label in self.peptide_labels if label in strategy_call(
+                                self.strategy, self.classifier_instances, mol
+                            )["chebi_classes"]]
+                if self.chebi_graph:
+                    indirect_pos_labels = [str(pr) for label in pos_labels for pr in self.chebi_graph.predecessors(int(label))]
+                    pos_labels = list(set(pos_labels + indirect_pos_labels))
                 results.append(
                     {
                         label: (
                             1
                             if label
-                            in strategy_call(
-                                self.strategy, self.classifier_instances, mol
-                            )["chebi_classes"]
+                            in pos_labels
                             else 0
                         )
-                        for label in self.peptide_labels
+                        for label in self.peptide_labels + pos_labels
                     }
                 )