add lookup classifier and chemlog-by-element classifier

sfluegel05 · sfluegel05 · commit c575637c1ac0 · 2025-07-14T17:53:47.000+02:00
diff --git a/chebifier/model_registry.py b/chebifier/model_registry.py
@@ -4,9 +4,10 @@
     WMVwithPPVNPVEnsemble,
 )
 from chebifier.prediction_models import (
-    ChemLogPredictor,
+    ChemlogPeptidesPredictor,
     ElectraPredictor,
     ResGatedPredictor,
+    ChEBILookupPredictor, ChemlogByElementPredictor
 )
 
 ENSEMBLES = {
@@ -19,7 +20,9 @@
 MODEL_TYPES = {
     "electra": ElectraPredictor,
     "resgated": ResGatedPredictor,
-    "chemlog": ChemLogPredictor,
+    "chemlog_peptides": ChemlogPeptidesPredictor,
+    "chebi_lookup": ChEBILookupPredictor,
+    "chemlog_element": ChemlogByElementPredictor
 }
 
 
diff --git a/chebifier/prediction_models/__init__.py b/chebifier/prediction_models/__init__.py
@@ -1,6 +1,7 @@
 from .base_predictor import BasePredictor
-from .chemlog_predictor import ChemLogPredictor
+from .chemlog_predictor import ChemlogPeptidesPredictor, ChemlogByElementPredictor
 from .electra_predictor import ElectraPredictor
 from .gnn_predictor import ResGatedPredictor
-
-__all__ = ["BasePredictor", "ChemLogPredictor", "ElectraPredictor", "ResGatedPredictor"]
+from .chebi_lookup import ChEBILookupPredictor
+__all__ = ["BasePredictor", "ChemlogPeptidesPredictor", "ElectraPredictor", "ResGatedPredictor", "ChEBILookupPredictor",
+           "ChemlogByElementPredictor"]
diff --git a/chebifier/prediction_models/chebi_lookup.py b/chebifier/prediction_models/chebi_lookup.py
@@ -0,0 +1,149 @@
+from chebifier.prediction_models import BasePredictor
+import os
+import networkx as nx
+from rdkit import Chem
+import json
+
+
+class ChEBILookupPredictor(BasePredictor):
+    """Predictor that looks up the canonical form of a SMILES string in ChEBI.
+
+    The lookup table maps RDKit-canonical SMILES to a list of
+    ``(chebi_id, predecessors)`` entries taken from the ChEBI class hierarchy
+    graph. It is cached as JSON under ``data/chebi_v<version>/``.
+    """
+
+    def __init__(self, model_name: str, description: str = None, chebi_version: int = 241, **kwargs):
+        """Set up the predictor and load (or build) the SMILES lookup table.
+
+        Args:
+            model_name: Passed on to ``BasePredictor``.
+            description: Text returned by ``info_text``; a default is used if falsy.
+            chebi_version: ChEBI release the lookup table is built from.
+            **kwargs: Passed on to ``BasePredictor``.
+        """
+        super().__init__(model_name, **kwargs)
+        self._description = description or "ChEBI Lookup: If the SMILES is equivalent to a ChEBI entry, retrieve the classification of that entry."
+        self.chebi_version = chebi_version
+        self.lookup_table = self.get_smiles_lookup()
+
+    def get_smiles_lookup(self):
+        """Return the SMILES lookup table, using the JSON cache when it exists.
+
+        If the cache file is missing, the table is built from ChEBI and written to
+        ``data/chebi_v<version>/smiles_lookup.json`` (relative to the working
+        directory). Entries read back from JSON are lists rather than tuples.
+        """
+        path = os.path.join("data", f"chebi_v{self.chebi_version}", "smiles_lookup.json")
+        if not os.path.exists(path):
+            smiles_lookup = self.build_smiles_lookup()
+            # The cache directory may not exist yet; open() would fail without it.
+            os.makedirs(os.path.dirname(path), exist_ok=True)
+            with open(path, "w", encoding="utf-8") as f:
+                json.dump(smiles_lookup, f, indent=4)
+        else:
+            print("Loading existing SMILES lookup...")
+            with open(path, "r", encoding="utf-8") as f:
+                smiles_lookup = json.load(f)
+        return smiles_lookup
+
+    def build_smiles_lookup(self):
+        """Build the lookup table from the ChEBI class hierarchy graph.
+
+        Returns:
+            A dict mapping each canonical SMILES to a list of
+            ``(chebi_id, predecessors)`` tuples, one per ChEBI entry that has this
+            canonical SMILES.
+        """
+        # todo test
+        from chebai.preprocessing.datasets.chebi import ChEBIOver50
+        self.chebi_dataset = ChEBIOver50(chebi_version=self.chebi_version)
+        self.chebi_dataset._download_required_data()
+        chebi_graph = self.chebi_dataset._extract_class_hierarchy(
+            os.path.join(self.chebi_dataset.raw_dir, "chebi.obo")
+        )
+        smiles_lookup = dict()
+        for chebi_id, smiles in nx.get_node_attributes(chebi_graph, "smiles").items():
+            if smiles is None:
+                continue
+            try:
+                mol = Chem.MolFromSmiles(smiles)
+                if mol is None:
+                    print(f"Failed to parse SMILES {smiles} for ChEBI ID {chebi_id}")
+                    continue
+                canonical_smiles = Chem.MolToSmiles(mol)
+                # Several ChEBI entries can share one canonical SMILES; keep all of them.
+                smiles_lookup.setdefault(canonical_smiles, []).append(
+                    (chebi_id, list(chebi_graph.predecessors(chebi_id)))
+                )
+            except Exception as e:
+                # Best effort: a single bad entry must not abort building the table.
+                print(f"Failed to parse SMILES {smiles} for ChEBI ID {chebi_id}: {e}")
+        return smiles_lookup
+
+    def predict_smiles_list(self, smiles_list: list[str]) -> list:
+        """Look up each SMILES and return one prediction per input, in input order.
+
+        Returns:
+            A list with the same length as ``smiles_list``. Each item is a dict
+            mapping class IDs (as strings) to 1, or ``None`` if the SMILES is
+            empty, cannot be parsed, or has no entry in the lookup table.
+        """
+        predictions = []
+        for smiles in smiles_list:
+            mol = Chem.MolFromSmiles(smiles) if smiles else None
+            if mol is None:
+                predictions.append(None)
+                continue
+            parent_candidates = self.lookup_table.get(Chem.MolToSmiles(mol))
+            if not parent_candidates:
+                # No match: still emit a placeholder so that the predictions stay
+                # aligned with the input list.
+                predictions.append(None)
+                continue
+            if len(parent_candidates) > 1:
+                print(f"Multiple matches found in ChEBI for SMILES {smiles}: {', '.join(str(chebi_id) for chebi_id, _ in parent_candidates)}")
+            # Union of the classes of all matched entries (dict keys de-duplicate).
+            predictions.append({str(k): 1 for _, parents in parent_candidates for k in parents})
+        return predictions
+
+    @property
+    def info_text(self):
+        """Return the description of this predictor."""
+        if self._description is None:
+            return "No description is available for this model."
+        return self._description
+
+    def explain_smiles(self, smiles: str) -> dict:
+        """Return a ``{"highlights": [...]}`` dict describing the lookup result for ``smiles``."""
+        mol = Chem.MolFromSmiles(smiles)
+        if mol is None:
+            return {"highlights": [
+                ("text", "The input SMILES could not be parsed into a valid molecule.")
+            ]}
+        canonical_smiles = Chem.MolToSmiles(mol)
+        if canonical_smiles not in self.lookup_table:
+            return {"highlights": [
+                ("text", "The input SMILES does not match any ChEBI entry.")
+            ]}
+        parent_candidates = self.lookup_table[canonical_smiles]
+        return {"highlights": [
+            ("text",
+             f"The ChEBI Lookup matches the canonical version of the input SMILES against ChEBI (v{self.chebi_version})."
+             f" It found {'1 match' if len(parent_candidates) == 1 else f'{len(parent_candidates)} matches'}:"
+             f" {', '.join(f'CHEBI:{chebi_id}' for chebi_id, _ in parent_candidates)}. The predicted classes are the"
+             f" parent classes of the matched ChEBI entries.")
+        ]}
+
+
+if __name__ == "__main__":
+    predictor = ChEBILookupPredictor("ChEBI Lookup")
+    print(predictor.info_text)
+    # Example usage
+    smiles_list = [
+        "CCO",
+        "C1=CC=CC=C1",
+        '*C(=O)OC[C@H](COP(=O)([O-])OCC[N+](C)(C)C)OC(*)=O',  # SMILES with 251 matches in ChEBI
+    ]
+    predictions = predictor.predict_smiles_list(smiles_list)
+    print(predictions)
diff --git a/chebifier/prediction_models/chemlog_predictor.py b/chebifier/prediction_models/chemlog_predictor.py
@@ -9,6 +9,7 @@
     is_emericellamide,
 )
 from chemlog.cli import CLASSIFIERS, _smiles_to_mol, strategy_call
+from chemlog_extra.alg_classification.by_element_classification import XMolecularEntityClassifier, OrganoXCompoundClassifier
 
 from .base_predictor import BasePredictor
 
@@ -38,8 +39,23 @@
     "Y": "L-tyrosine",
 }
 
+class ChemlogByElementPredictor(BasePredictor):
 
-class ChemLogPredictor(BasePredictor):
+    def __init__(self, model_name: str, **kwargs):
+        super().__init__(model_name, **kwargs)
+        # Both classifiers are imported from chemlog_extra's by-element classification module.
+        self.x_molecular, self.organo_x = XMolecularEntityClassifier(), OrganoXCompoundClassifier()
+
+    def predict_smiles_list(self, smiles_list: list[str]) -> list:
+        """Return one ``{class: 1}`` dict per SMILES, or ``None`` where the molecule is falsy."""
+        predictions = []
+        for mol in [_smiles_to_mol(smiles) for smiles in smiles_list]:
+            # Classes are the first element of each classifier's result, concatenated.
+            found = self.x_molecular.classify(mol)[0] + self.organo_x.classify(mol)[0] if mol else None
+            predictions.append(None if found is None else {str(cls): 1 for cls in found})
+        return predictions
+
+class ChemlogPeptidesPredictor(BasePredictor):
     def __init__(self, model_name: str, **kwargs):
         super().__init__(model_name, **kwargs)
         self.strategy = "algo"
@@ -333,7 +349,12 @@ def build_explain_blocks_proteinogenics(self, proteinogenics, atoms):
 
     def explain_smiles(self, smiles) -> dict:
         info = self.get_chemlog_result_info(smiles)
-        highlight_blocks = self.build_explain_blocks_peptides(info)
+        zero_blocks = [  # introductory text block, prepended to the peptide highlight blocks
+            ("text", "Results for peptides and peptide-related classes (e.g. peptide anion, depsipeptide) have been calculated "
+             "with a rule-based system. The following shows which parts of the molecule were identified as relevant "
+             "structures and have influenced the classification.")
+        ]
+        highlight_blocks = zero_blocks + self.build_explain_blocks_peptides(info)
 
         for chebi_id, internal_name in [
             (64372, "emericellamide"),

Original file line number	Diff line number	Diff line change
`@@ -4,9 +4,10 @@`
`4`	`4`	`WMVwithPPVNPVEnsemble,`
`5`	`5`	`)`
`6`	`6`	`from chebifier.prediction_models import (`
`7`		`- ChemLogPredictor,`
	`7`	`+ ChemlogPeptidesPredictor,`
`8`	`8`	`ElectraPredictor,`
`9`	`9`	`ResGatedPredictor,`
	`10`	`+ ChEBILookupPredictor, ChemlogByElementPredictor`
`10`	`11`	`)`
`11`	`12`
`12`	`13`	`ENSEMBLES = {`
`@@ -19,7 +20,9 @@`
`19`	`20`	`MODEL_TYPES = {`
`20`	`21`	`"electra": ElectraPredictor,`
`21`	`22`	`"resgated": ResGatedPredictor,`
`22`		`- "chemlog": ChemLogPredictor,`
	`23`	`+ "chemlog_peptides": ChemlogPeptidesPredictor,`
	`24`	`+ "chebi_lookup": ChEBILookupPredictor,`
	`25`	`+ "chemlog_element": ChemlogByElementPredictor`
`23`	`26`	`}`
`24`	`27`
`25`	`28`