1
+ # Batch Processing SMILES strings for LSH Forest data
2
+ # .smi file with one smiles string per line is required
3
+ # This is part of SampleDock package
4
+ #
5
+ # COMMERCIAL LICENSE: https://github.com/atfrank/SampleDock/blob/master/LICENSE_COMMERICAL
6
+ # NON-COMMERCIAL LICENSE: https://github.com/atfrank/SampleDock/blob/master/LICENSE_NON-COMMERICAL
7
+ # Frank Lab 2020-2021
8
+
9
+ import sys
10
+ import os
11
+ import pickle
12
+ #from multiprocessing import Pool
13
+ from tqdm .contrib .concurrent import process_map
14
+ from collections import namedtuple
15
+
16
+ import tmap as tm
17
+ from mhfp .encoder import MHFPEncoder
18
+ from rdkit .Chem import RDConfig
19
+ from rdkit .Chem import SDMolSupplier , MolFromSmiles , MolToSmiles
20
+ from rdkit .Chem .Crippen import MolLogP as LogP
21
+ from rdkit .Chem .QED import default as QED
22
+ from rdkit .Chem .Descriptors import MolWt
23
+ sys .path .append (os .path .join (RDConfig .RDContribDir , 'SA_Score' ))
24
+ from sascorer import calculateScore as SAS
25
+
26
def smi_to_mol(smi):
    """Parse a single SMILES string into an RDKit Mol (or None on failure).

    Exists as a module-level wrapper because Chem.MolFromSmiles is a
    Boost.Python function and cannot be pickled for multiprocessing.
    """
    return MolFromSmiles(smi)
31
+
32
def file_to_mols(filepath, worker=None):
    """Load molecules from a .smi, .sd, or .sdf file.

    Args:
        filepath: Path to the input file. A .smi file must contain one
            SMILES string per line with no header; .sd/.sdf files are read
            with RDKit's SDMolSupplier.
        worker: Number of processes used to parse SMILES in parallel.
            When None, falls back to the script-level ``a.worker`` value
            (backward compatible with the original behavior, which read
            that global directly).

    Returns:
        List of RDKit Mol objects; entries RDKit failed to parse are dropped.

    Raises:
        Exception: If the file extension is not .smi, .sd, or .sdf.
    """
    if filepath.endswith('.smi'):
        print('Converting SMILES to list of Mols')
        sys.stdout.flush()
        with open(filepath) as infile:
            smiles_list = [line.rstrip() for line in infile]

        if worker is None:
            worker = a.worker  # CLI-parsed global; NameError if run outside __main__

        # process_map gives a progress bar; smi_to_mol is the picklable
        # wrapper around MolFromSmiles required for multiprocessing.
        mols = process_map(smi_to_mol, smiles_list, chunksize=100,
                           max_workers=worker)

        # Drop SMILES that failed to parse (MolFromSmiles returned None)
        mols = [m for m in mols if m]

    elif filepath.endswith('.sd') or filepath.endswith('.sdf'):
        mols = [mol for mol in SDMolSupplier(filepath) if mol]

    else:
        raise Exception('Invalid file: {}\n'.format(filepath) +
                        '.smi, .sd, or .sdf extension is expected')

    return mols
55
+
56
def cal_fp_props(mol):
    """Compute the MinHash fingerprint and property tuple for one Mol.

    Module-level wrapper so it can be pickled for multiprocessing. Relies on
    the module-level ``Props`` namedtuple and ``enc`` MHFPEncoder declared
    under __main__.

    Returns:
        (fingerprint, Props(SMILES, MolWt, LogP, QED, SAS)) for the Mol.
    """
    props = Props(MolToSmiles(mol), MolWt(mol), LogP(mol), QED(mol), SAS(mol))
    return enc.encode_mol(mol), props
61
+
62
def single_convert(mol_list, outpath, props_named_tuple=None, worker=None):
    """Fingerprint all Mols in one pass and pickle the results.

    Args:
        mol_list: List of RDKit Mol objects.
        outpath: Existing directory where props.pickle and fps.pickle
            are written (each as a single pickle dump).
        props_named_tuple: Optional namedtuple class; when given, each
            property tuple is re-wrapped in it so it unpickles correctly
            (the class must be declared under __main__).
        worker: Number of worker processes. When None, falls back to the
            script-level ``a.worker`` value (original behavior).

    Returns:
        (fps, props): tuple of fingerprints and property tuples.
    """
    if worker is None:
        worker = a.worker  # CLI-parsed global; NameError if run outside __main__

    # process_map gives a progress bar while fanning out across processes
    fps_and_props = process_map(cal_fp_props, mol_list,
                                chunksize=100,
                                max_workers=worker)

    # Unpack the returned (fp, props) pairs; guard the empty-input case,
    # where zip(*[]) would raise ValueError on unpacking
    if fps_and_props:
        fps, props = zip(*fps_and_props)
    else:
        fps, props = (), ()

    # Turn props into a list of named tuples
    if props_named_tuple:
        props = [props_named_tuple(*p) for p in props]

    with open(os.path.join(outpath, "props.pickle"), "wb") as pf:
        pickle.dump(props, pf)
    with open(os.path.join(outpath, "fps.pickle"), "wb") as f:
        pickle.dump(fps, f)

    return fps, props
84
+
85
def batch_divider(length, batch_size):
    """Split ``length`` items into consecutive batch index ranges.

    Returns a list of Batch(batch_num, start, stop) named tuples, numbered
    from 1, where the final batch absorbs any remainder. An empty input
    (length == 0) yields an empty list.
    """
    Batch = namedtuple('Batch', ['batch_num', 'start', 'stop'])

    # Pair each batch start with the next start (or the end of the data)
    starts = list(range(0, length, batch_size))
    stops = starts[1:] + [length]

    return [Batch(num, lo, hi)
            for num, (lo, hi) in enumerate(zip(starts, stops), start=1)]
96
+
97
def _combine_pickle_dumps(path):
    """Merge the multiple pickle records appended to *path* into one list,
    rewrite the file as a single pickle dump, and return the merged list."""
    items = []
    with open(path, 'rb') as fh:
        try:
            while True:
                items.extend(pickle.load(fh))
        except EOFError:
            # Normal termination: all batch records consumed
            pass
    with open(path, 'wb') as fh:
        pickle.dump(items, fh)
    return items

def batch_convert(mol_list, batch_size, outpath, props_named_tuple=None, worker=None):
    """Fingerprint Mols batch-by-batch, pickling each batch incrementally.

    Each batch is dumped as a separate pickle record so partial progress is
    on disk; after the loop the records are merged into single dumps with
    the same layout single_convert() produces.

    Args:
        mol_list: List of RDKit Mol objects.
        batch_size: Number of Mols per batch; the last batch takes the remainder.
        outpath: Existing directory for props.pickle and fps.pickle.
        props_named_tuple: Optional namedtuple class used to wrap each
            property tuple (must be declared under __main__ to unpickle).
        worker: Number of worker processes. When None, falls back to the
            script-level ``a.worker`` value (original behavior).

    Returns:
        (fps, props): the fully merged fingerprints and property tuples.
    """
    batches = batch_divider(len(mol_list), batch_size)

    if worker is None:
        worker = a.worker  # CLI-parsed global; NameError if run outside __main__

    props_path = os.path.join(outpath, "props.pickle")
    fps_path = os.path.join(outpath, "fps.pickle")

    # 'with' guarantees the handles are closed even if a batch fails
    # (the originals were opened bare and leaked on any exception)
    with open(props_path, "wb") as propfile, open(fps_path, "wb") as fpfile:
        for batch_num, start, stop in batches:
            # Reporting progress
            print('Current batch: {} of {} \t'.format(batch_num, len(batches)),
                  end='\r')
            sys.stdout.flush()

            batch_mol_list = mol_list[start:stop]

            # process_map gives a progress bar per batch
            fps_and_props = process_map(cal_fp_props, batch_mol_list,
                                        chunksize=100,
                                        max_workers=worker)

            # Unpack the returned (fp, props) pairs
            fps, props = zip(*fps_and_props)

            # Turn props into a list of named tuples
            if props_named_tuple:
                props = [props_named_tuple(*p) for p in props]

            # One pickle record per batch; merged after the loop
            pickle.dump(props, propfile)
            pickle.dump(fps, fpfile)

    print('Combining pickle files')
    sys.stdout.flush()
    props = _combine_pickle_dumps(props_path)
    fps = _combine_pickle_dumps(fps_path)

    print('Batch processing complete!')
    sys.stdout.flush()
    return fps, props
156
+
157
def MolsToLSHForest(mol_list, save_path="./", worker=os.cpu_count()-1, batch_size=None):
    """Convert a list of RDKit Mols into an LSH Forest and property tuples.

    Writes props.pickle, fps.pickle, and lf.dat under save_path, creating
    the directory if needed.

    Args:
        mol_list: List of RDKit Mol objects.
        save_path: Output directory. BUG FIX: this parameter was previously
            ignored — the body read the module-level ``outpath`` global, so
            any caller passing a different path silently wrote to the CLI
            path instead.
        worker: Reported in the banner; the convert functions themselves
            read the script-level ``a.worker`` when not told otherwise.
        batch_size: If set, process in batches of this size; otherwise all
            Mols are converted in a single pass.

    Returns:
        (lf, props): the populated tm.LSHForest (the module-level instance
        created under __main__) and the list of property tuples.
    """
    outpath = os.path.abspath(save_path)

    print('Available CPU Cores =', os.cpu_count())
    print('Number of CPU Core used =', worker)
    print('\nTotal Number of Mols =', len(mol_list))
    if batch_size:
        print('Batch Size =', batch_size)
    os.makedirs(outpath, exist_ok=True)
    print('Saving Files at', outpath)
    sys.stdout.flush()

    if batch_size:
        fps, props = batch_convert(mol_list, batch_size, outpath, props_named_tuple=Props)
    else:
        fps, props = single_convert(mol_list, outpath, props_named_tuple=Props)

    print("Loading data and converting to LSH Forest data")
    print('Converting MinHash Fingerprints to Vectors')
    sys.stdout.flush()
    fps = [tm.VectorUint(fp) for fp in fps]
    print(len(fps), 'Fingerprints Converted')
    sys.stdout.flush()

    # lf is the module-level tm.LSHForest created under __main__
    lf.batch_add(fps)
    lf.index()
    lf.store(os.path.join(outpath, "lf.dat"))

    return lf, props
184
+
185
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(prog='LSH Forest Data Converter')
    parser.add_argument("filename", metavar='F', type=str,
                        help="Path to the input file, can be either .smi (with no header) or .sdf")
    parser.add_argument("-b", "--batch", type=int,
                        help="Size of the batch for processing, default to all (one batch)",
                        default=None)
    parser.add_argument("-o", "--output", type=str, help="Output file path, default to ./LSHData",
                        default="./LSHData")
    parser.add_argument("-w", "--worker", type=int, help="Number of workers (CPU cores) to use for multiprocessing,\
                        default to the number of available CPU cores minus one",
                        default=os.cpu_count()-1)
    parser.add_argument("-d", "--dim", type=int, help="Fingerprint dimension, default to 1024",
                        default=1024)

    # NOTE: `a` and `outpath` are read as module-level globals by the
    # functions above (e.g. a.worker inside file_to_mols/single_convert)
    a = parser.parse_args()
    outpath = os.path.abspath(a.output)
    mols = file_to_mols(a.filename)

    # Define a named properties tuple
    # To pickle a named tuple correctly:
    ## 1) The named tuple object has to be declared under __main__
    ## 2) The declared variable for the named tuple has to match
    ##    the tuple name in the quotation mark!!
    Props = namedtuple('Props', ['SMILES', 'MolWt', 'LogP', 'QED', 'SAS'])

    # MinHash fingerprints (mhfp) encoder. This is a specialized molecular
    # fingerprint scheme; used as a global by cal_fp_props
    enc = MHFPEncoder(a.dim)
    # Locality Sensitive Hashing Forest; populated/stored by MolsToLSHForest
    lf = tm.LSHForest(a.dim, 64)

    MolsToLSHForest(mol_list=mols,
                    save_path=outpath,
                    worker=a.worker,
                    batch_size=a.batch)
221
+
0 commit comments