import json
import os
from typing import Optional

import networkx as nx
from rdkit import Chem

from chebifier.prediction_models import BasePredictor
6+
class ChEBILookupPredictor(BasePredictor):
    """Predict ChEBI classes by exact lookup of the canonical SMILES.

    The predictor keeps a table that maps RDKit-canonical SMILES strings to
    the ChEBI entries sharing that SMILES. For every matching entry, the
    predecessors of the entry in the ChEBI class hierarchy graph (presented
    to the user as its "parent classes") are reported as predictions.
    """

    def __init__(
        self,
        model_name: str,
        description: Optional[str] = None,
        chebi_version: int = 241,
        **kwargs,
    ):
        """Create the predictor and load (or build) its lookup table.

        Args:
            model_name: Name passed on to ``BasePredictor``.
            description: Text returned by ``info_text``; a default
                description is used when this is ``None`` or empty.
            chebi_version: ChEBI release used for the lookup table.
            **kwargs: Forwarded unchanged to ``BasePredictor.__init__``.
        """
        super().__init__(model_name, **kwargs)
        self._description = description or (
            "ChEBI Lookup: If the SMILES is equivalent to a ChEBI entry, "
            "retrieve the classification of that entry."
        )
        self.chebi_version = chebi_version
        # canonical SMILES -> list of (chebi_id, parents) pairs; the pairs are
        # tuples when freshly built and lists when loaded from the JSON cache.
        self.lookup_table = self.get_smiles_lookup()

    def get_smiles_lookup(self) -> dict:
        """Return the SMILES lookup table, using a JSON file as cache.

        The cache lives at ``data/chebi_v<version>/smiles_lookup.json``
        relative to the current working directory. If it does not exist, the
        table is built with ``build_smiles_lookup`` and written there.

        Returns:
            Mapping from canonical SMILES to a list of
            ``(chebi_id, parents)`` pairs.
        """
        path = os.path.join(
            "data", f"chebi_v{self.chebi_version}", "smiles_lookup.json"
        )
        if os.path.exists(path):
            print("Loading existing SMILES lookup...")
            with open(path, "r", encoding="utf-8") as f:
                return json.load(f)

        smiles_lookup = self.build_smiles_lookup()
        # The cache directory is not guaranteed to exist on a first run.
        os.makedirs(os.path.dirname(path), exist_ok=True)
        # Write to a temporary file first so that an interrupted dump never
        # leaves a truncated cache that later runs would try to load.
        tmp_path = f"{path}.tmp"
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(smiles_lookup, f, indent=4)
        os.replace(tmp_path, path)
        return smiles_lookup

    def build_smiles_lookup(self) -> dict:
        """Build the SMILES lookup table from the ChEBI ontology.

        Downloads the data required by the ``ChEBIOver50`` dataset, extracts
        the class hierarchy from ``chebi.obo`` and canonicalises the SMILES
        of every node that has one. Entries whose SMILES cannot be processed
        are reported on stdout and skipped.

        Returns:
            Mapping from canonical SMILES to a list of
            ``(chebi_id, parents)`` tuples, where ``parents`` are the
            predecessors of ``chebi_id`` in the hierarchy graph.
        """
        # TODO: test
        # Imported locally so that chebai is only required when the lookup
        # actually has to be built.
        from chebai.preprocessing.datasets.chebi import ChEBIOver50

        self.chebi_dataset = ChEBIOver50(chebi_version=self.chebi_version)
        self.chebi_dataset._download_required_data()
        chebi_graph = self.chebi_dataset._extract_class_hierarchy(
            os.path.join(self.chebi_dataset.raw_dir, "chebi.obo")
        )

        smiles_lookup = {}
        node_smiles = nx.get_node_attributes(chebi_graph, "smiles")
        for chebi_id, smiles in node_smiles.items():
            if smiles is None:
                continue
            try:
                mol = Chem.MolFromSmiles(smiles)
                if mol is None:
                    print(f"Failed to parse SMILES {smiles} for ChEBI ID {chebi_id}")
                    continue
                canonical_smiles = Chem.MolToSmiles(mol)
            except Exception as e:
                # Best effort: one problematic entry must not abort the build.
                print(f"Failed to parse SMILES {smiles} for ChEBI ID {chebi_id}: {e}")
                continue
            # Several ChEBI entries can share one canonical SMILES; keep all.
            smiles_lookup.setdefault(canonical_smiles, []).append(
                (chebi_id, list(chebi_graph.predecessors(chebi_id)))
            )
        return smiles_lookup

    @staticmethod
    def _canonical_smiles(smiles):
        """Return the RDKit-canonical form of ``smiles``.

        Returns ``None`` when ``smiles`` is empty/``None`` or cannot be parsed.
        """
        if not smiles:
            return None
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None
        return Chem.MolToSmiles(mol)

    def predict_smiles_list(self, smiles_list: list[str]) -> list:
        """Predict classes for each SMILES by looking it up in ChEBI.

        Args:
            smiles_list: SMILES strings to classify.

        Returns:
            A list with exactly one element per input, in input order. An
            element is ``None`` when the SMILES is empty, cannot be parsed or
            has no match in the lookup table; otherwise it is a dict mapping
            each parent class id (as ``str``) of the matched entries to ``1``.
        """
        predictions = []
        for smiles in smiles_list:
            canonical_smiles = self._canonical_smiles(smiles)
            parent_candidates = (
                None
                if canonical_smiles is None
                else self.lookup_table.get(canonical_smiles)
            )
            if parent_candidates is None:
                predictions.append(None)
                continue
            if len(parent_candidates) > 1:
                print(
                    f"Multiple matches found in ChEBI for SMILES {smiles}: "
                    f"{', '.join(str(chebi_id) for chebi_id, _ in parent_candidates)}"
                )
            # Union of the parents of all matches; the dict de-duplicates
            # while keeping a deterministic (first-seen) order.
            predictions.append(
                {
                    str(parent): 1
                    for _, parents in parent_candidates
                    for parent in parents
                }
            )
        return predictions

    @property
    def info_text(self):
        """Human-readable description of this predictor."""
        if self._description is None:
            return "No description is available for this model."
        return self._description

    def explain_smiles(self, smiles: str) -> dict:
        """Explain how the prediction for ``smiles`` was obtained.

        Args:
            smiles: SMILES string to explain.

        Returns:
            A dict with a single key ``"highlights"`` holding a list of
            ``("text", message)`` tuples. The message states that the input
            could not be parsed, that it matched no ChEBI entry, or which
            ChEBI entries it matched.
        """
        canonical_smiles = self._canonical_smiles(smiles)
        if canonical_smiles is None:
            return {"highlights": [
                ("text", "The input SMILES could not be parsed into a valid molecule.")
            ]}
        parent_candidates = self.lookup_table.get(canonical_smiles)
        if parent_candidates is None:
            return {"highlights": [
                ("text", "The input SMILES does not match any ChEBI entry.")
            ]}

        n_matches = len(parent_candidates)
        match_count = "1 match" if n_matches == 1 else f"{n_matches} matches"
        matched_ids = ", ".join(
            f"CHEBI:{chebi_id}" for chebi_id, _ in parent_candidates
        )
        return {"highlights": [
            ("text",
             f"The ChEBI Lookup matches the canonical version of the input SMILES against ChEBI (v{self.chebi_version})."
             f" It found {match_count}:"
             f" {matched_ids}. The predicted classes are the"
             f" parent classes of the matched ChEBI entries.")
        ]}
107+
108+
if __name__ == "__main__":
    # Small manual demo: build/load the lookup and classify a few SMILES.
    predictor = ChEBILookupPredictor("ChEBI Lookup")
    print(predictor.info_text)
    # Example usage
    smiles_list = [
        "CCO",
        # The comma after this entry matters: without it Python concatenates
        # the two adjacent string literals into a single (invalid) SMILES.
        "C1=CC=CC=C1",
        # SMILES with 251 matches in ChEBI
        "*C(=O)OC[C@H](COP(=O)([O-])OCC[N+](C)(C)C)OC(*)=O",
    ]
    predictions = predictor.predict_smiles_list(smiles_list)
    print(predictions)
0 commit comments