Skip to content

Commit c575637

Browse files
committed
add lookup classifier and chemlog-by-element classifier
1 parent df68ecb commit c575637

File tree

4 files changed

+147
-7
lines changed

4 files changed

+147
-7
lines changed

chebifier/model_registry.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,10 @@
44
WMVwithPPVNPVEnsemble,
55
)
66
from chebifier.prediction_models import (
7-
ChemLogPredictor,
7+
ChemlogPeptidesPredictor,
88
ElectraPredictor,
99
ResGatedPredictor,
10+
ChEBILookupPredictor, ChemlogByElementPredictor
1011
)
1112

1213
ENSEMBLES = {
@@ -19,7 +20,9 @@
1920
MODEL_TYPES = {
2021
"electra": ElectraPredictor,
2122
"resgated": ResGatedPredictor,
22-
"chemlog": ChemLogPredictor,
23+
"chemlog_peptides": ChemlogPeptidesPredictor,
24+
"chebi_lookup": ChEBILookupPredictor,
25+
"chemlog_element": ChemlogByElementPredictor
2326
}
2427

2528

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
from .base_predictor import BasePredictor
2-
from .chemlog_predictor import ChemLogPredictor
2+
from .chemlog_predictor import ChemlogPeptidesPredictor, ChemlogByElementPredictor
33
from .electra_predictor import ElectraPredictor
44
from .gnn_predictor import ResGatedPredictor
5-
6-
__all__ = ["BasePredictor", "ChemLogPredictor", "ElectraPredictor", "ResGatedPredictor"]
5+
from .chebi_lookup import ChEBILookupPredictor
6+
__all__ = ["BasePredictor", "ChemlogPeptidesPredictor", "ElectraPredictor", "ResGatedPredictor", "ChEBILookupPredictor",
7+
"ChemlogByElementPredictor"]
Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
from chebifier.prediction_models import BasePredictor
2+
import os
3+
import networkx as nx
4+
from rdkit import Chem
5+
import json
6+
7+
class ChEBILookupPredictor(BasePredictor):
8+
9+
def __init__(self, model_name: str, description: str = None, chebi_version: int = 241, **kwargs):
10+
super().__init__(model_name, **kwargs)
11+
self._description = description or "ChEBI Lookup: If the SMILES is equivalent to a ChEBI entry, retrieve the classification of that entry."
12+
self.chebi_version = chebi_version
13+
self.lookup_table = self.get_smiles_lookup()
14+
15+
def get_smiles_lookup(self):
16+
path = os.path.join("data", f"chebi_v{self.chebi_version}", "smiles_lookup.json")
17+
if not os.path.exists(path):
18+
smiles_lookup = self.build_smiles_lookup()
19+
with open(path, "w", encoding="utf-8") as f:
20+
json.dump(smiles_lookup, f, indent=4)
21+
else:
22+
print("Loading existing SMILES lookup...")
23+
with open(path, "r", encoding="utf-8") as f:
24+
smiles_lookup = json.load(f)
25+
return smiles_lookup
26+
27+
28+
def build_smiles_lookup(self):
29+
# todo test
30+
from chebai.preprocessing.datasets.chebi import ChEBIOver50
31+
self.chebi_dataset = ChEBIOver50(chebi_version=self.chebi_version)
32+
self.chebi_dataset._download_required_data()
33+
chebi_graph = self.chebi_dataset._extract_class_hierarchy(
34+
os.path.join(self.chebi_dataset.raw_dir, "chebi.obo")
35+
)
36+
smiles_lookup = dict()
37+
for chebi_id, smiles in nx.get_node_attributes(chebi_graph, "smiles").items():
38+
if smiles is not None:
39+
try:
40+
mol = Chem.MolFromSmiles(smiles)
41+
if mol is None:
42+
print(f"Failed to parse SMILES {smiles} for ChEBI ID {chebi_id}")
43+
continue
44+
canonical_smiles = Chem.MolToSmiles(mol)
45+
if canonical_smiles not in smiles_lookup:
46+
smiles_lookup[canonical_smiles] = []
47+
# if the canonical SMILES is already in the lookup, append "different interpretation of the SMILES"
48+
smiles_lookup[canonical_smiles].append((chebi_id, list(chebi_graph.predecessors(chebi_id))))
49+
except Exception as e:
50+
print(f"Failed to parse SMILES {smiles} for ChEBI ID {chebi_id}: {e}")
51+
return smiles_lookup
52+
53+
54+
def predict_smiles_list(self, smiles_list: list[str]) -> list:
55+
predictions = []
56+
for smiles in smiles_list:
57+
if not smiles:
58+
predictions.append(None)
59+
continue
60+
mol = Chem.MolFromSmiles(smiles)
61+
if mol is None:
62+
predictions.append(None)
63+
continue
64+
canonical_smiles = Chem.MolToSmiles(mol)
65+
if canonical_smiles in self.lookup_table:
66+
parent_candidates = self.lookup_table[canonical_smiles]
67+
preds_i = dict()
68+
if len(parent_candidates) > 1:
69+
print(f"Multiple matches found in ChEBI for SMILES {smiles}: {', '.join(str(chebi_id) for chebi_id, _ in parent_candidates)}")
70+
for k in list(set(pp for _, p in parent_candidates for pp in p)):
71+
preds_i[str(k)] = 1
72+
elif len(parent_candidates) == 1:
73+
chebi_id, parents = parent_candidates[0]
74+
for k in parents:
75+
preds_i[str(k)] = 1
76+
else:
77+
preds_i = None
78+
predictions.append(preds_i)
79+
80+
return predictions
81+
82+
@property
83+
def info_text(self):
84+
if self._description is None:
85+
return "No description is available for this model."
86+
return self._description
87+
88+
def explain_smiles(self, smiles: str) -> dict:
89+
mol = Chem.MolFromSmiles(smiles)
90+
if mol is None:
91+
return {"highlights": [
92+
("text", "The input SMILES could not be parsed into a valid molecule.")
93+
]}
94+
canonical_smiles = Chem.MolToSmiles(mol)
95+
if canonical_smiles not in self.lookup_table:
96+
return {"highlights": [
97+
("text", "The input SMILES does not match any ChEBI entry.")
98+
]}
99+
parent_candidates = self.lookup_table[canonical_smiles]
100+
return {"highlights": [
101+
("text",
102+
f"The ChEBI Lookup matches the canonical version of the input SMILES against ChEBI (v{self.chebi_version})."
103+
f" It found {'1 match' if len(parent_candidates) == 1 else f'{len(parent_candidates)} matches'}:"
104+
f" {', '.join(f'CHEBI:{chebi_id}' for chebi_id, _ in parent_candidates)}. The predicted classes are the"
105+
f" parent classes of the matched ChEBI entries.")
106+
]}
107+
108+
109+
if __name__ == "__main__":
    # Small manual smoke test for the lookup predictor.
    predictor = ChEBILookupPredictor("ChEBI Lookup")
    print(predictor.info_text)
    # Example usage
    # NOTE: each SMILES must be its own list element. Without the comma,
    # adjacent string literals are silently concatenated into one string.
    smiles_list = [
        "CCO",
        "C1=CC=CC=C1",
        "*C(=O)OC[C@H](COP(=O)([O-])OCC[N+](C)(C)C)OC(*)=O",  # SMILES with 251 matches in ChEBI
    ]
    predictions = predictor.predict_smiles_list(smiles_list)
    print(predictions)

chebifier/prediction_models/chemlog_predictor.py

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
is_emericellamide,
1010
)
1111
from chemlog.cli import CLASSIFIERS, _smiles_to_mol, strategy_call
12+
from chemlog_extra.alg_classification.by_element_classification import XMolecularEntityClassifier, OrganoXCompoundClassifier
1213

1314
from .base_predictor import BasePredictor
1415

@@ -38,8 +39,23 @@
3839
"Y": "L-tyrosine",
3940
}
4041

42+
class ChemlogByElementPredictor(BasePredictor):
    """Predictor combining the two by-element classifiers from ``chemlog_extra``."""

    def __init__(self, model_name: str, **kwargs):
        """Set up the predictor with one instance of each by-element classifier.

        Args:
            model_name: Name passed on to ``BasePredictor``.
            **kwargs: Forwarded unchanged to ``BasePredictor``.
        """
        super().__init__(model_name, **kwargs)
        self.x_molecular = XMolecularEntityClassifier()
        self.organo_x = OrganoXCompoundClassifier()

    def predict_smiles_list(self, smiles_list: list[str]) -> list:
        """Classify every SMILES with both by-element classifiers.

        Args:
            smiles_list: SMILES strings to classify.

        Returns:
            One item per input, in order: ``None`` when the SMILES yields a
            falsy molecule, otherwise a dict mapping ``str(cls)`` to ``1`` for
            every class in the first element of each classifier's result.
        """
        # Convert all inputs up front, then classify the resulting molecules.
        molecules = [_smiles_to_mol(smiles) for smiles in smiles_list]
        results = []
        for molecule in molecules:
            if not molecule:
                results.append(None)
                continue
            element_classes = self.x_molecular.classify(molecule)[0]
            organo_classes = self.organo_x.classify(molecule)[0]
            results.append({str(cls): 1 for cls in element_classes + organo_classes})
        return results
57+
58+
class ChemlogPeptidesPredictor(BasePredictor):
4359
def __init__(self, model_name: str, **kwargs):
4460
super().__init__(model_name, **kwargs)
4561
self.strategy = "algo"
@@ -333,7 +349,12 @@ def build_explain_blocks_proteinogenics(self, proteinogenics, atoms):
333349

334350
def explain_smiles(self, smiles) -> dict:
335351
info = self.get_chemlog_result_info(smiles)
336-
highlight_blocks = self.build_explain_blocks_peptides(info)
352+
zero_blocks = [
353+
("text", "Results for peptides and peptide-related classes (e.g. peptide anion, depsipeptide) have been calculated"
354+
"with a rule-based system. The following shows which parts of the molecule were identified as relevant"
355+
"structures and have influenced the classification.")
356+
]
357+
highlight_blocks = zero_blocks + self.build_explain_blocks_peptides(info)
337358

338359
for chebi_id, internal_name in [
339360
(64372, "emericellamide"),

0 commit comments

Comments
 (0)