
Commit 3f576f5

tmap plotting implemented
1 parent e1ef375

File tree

7 files changed: +397, -15 lines


sampledock/SnD/LSH_batch.py

Lines changed: 221 additions & 0 deletions
@@ -0,0 +1,221 @@
# Batch processing of SMILES strings for LSH Forest data
# A .smi file with one SMILES string per line is required
# This is part of the SampleDock package
#
# COMMERCIAL LICENSE: https://github.com/atfrank/SampleDock/blob/master/LICENSE_COMMERICAL
# NON-COMMERCIAL LICENSE: https://github.com/atfrank/SampleDock/blob/master/LICENSE_NON-COMMERICAL
# Frank Lab 2020-2021

import sys
import os
import pickle
#from multiprocessing import Pool
from tqdm.contrib.concurrent import process_map
from collections import namedtuple

import tmap as tm
from mhfp.encoder import MHFPEncoder
from rdkit.Chem import RDConfig
from rdkit.Chem import SDMolSupplier, MolFromSmiles, MolToSmiles
from rdkit.Chem.Crippen import MolLogP as LogP
from rdkit.Chem.QED import default as QED
from rdkit.Chem.Descriptors import MolWt
sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score'))
from sascorer import calculateScore as SAS

# NOTE: the functions below rely on the module-level names a, Props, enc,
# and lf declared under __main__, so this module is meant to run as a script.

def smi_to_mol(smi):
    # Wrapper function for proper multiprocessing when converting SMILES to Mol.
    # Chem.MolFromSmiles is a Boost.Python.function and cannot be pickled for multiprocessing.
    mol = MolFromSmiles(smi)
    return mol

def file_to_mols(filepath):
    if filepath.endswith('.smi'):
        print('Converting SMILES to list of Mols')
        sys.stdout.flush()
        with open(filepath) as infile:
            smiles_list = [line.rstrip() for line in infile.readlines()]
        # Multiprocessing with all available threads
        #with Pool(processes = os.cpu_count()) as pool:
        #    mols = pool.map(smi_to_mol, smiles_list)

        mols = process_map(smi_to_mol, smiles_list, chunksize = 100,
                           max_workers = a.worker)

        mols = [m for m in mols if m]

    elif filepath.endswith('.sd') or filepath.endswith('.sdf'):
        mols = [mol for mol in SDMolSupplier(filepath) if mol]

    else:
        raise Exception('Invalid file: {}\n'.format(filepath) +
                        '.smi, .sd, or .sdf extension is expected')

    return mols

def cal_fp_props(mol):
    # Wrapper function for multiprocessing
    mol_props = Props(MolToSmiles(mol), MolWt(mol), LogP(mol), QED(mol), SAS(mol))
    fp = enc.encode_mol(mol)
    return fp, mol_props

def single_convert(mol_list, outpath, props_named_tuple = None):
    # Multiprocessing with all available threads
    #with Pool(processes = os.cpu_count()) as pool:
    #    fps_and_props = pool.map(cal_fp_props, mol_list)

    fps_and_props = process_map(cal_fp_props, mol_list,
                                chunksize = 100,
                                max_workers = a.worker)

    # Unpack the returned results
    fps, props = zip(*fps_and_props)

    # Turn props into a list of named tuples
    if props_named_tuple:
        props = [props_named_tuple(*p) for p in props]

    with open(os.path.join(outpath, "props.pickle"), "wb") as pf:
        pickle.dump(props, pf)
    with open(os.path.join(outpath, "fps.pickle"), "wb+") as f:
        pickle.dump(fps, f)

    return fps, props

def batch_divider(length, batch_size):
    # For better progress reporting and to include the remainder batch.
    # Define the named tuple for the indices
    Batch = namedtuple('Batch', ['batch_num', 'start', 'stop'])

    # Create an inclusive list of the batch start and stop index marks
    idx = [*range(0, length, batch_size), length]

    # List of named tuples
    batches = [Batch(*b) for b in zip(range(1, len(idx)), idx[:-1], idx[1:])]
    return batches

def batch_convert(mol_list, batch_size, outpath, props_named_tuple = None):
    # Divide into batches
    batches = batch_divider(len(mol_list), batch_size)

    propfile = open(os.path.join(outpath, "props.pickle"), "wb")
    fpfile = open(os.path.join(outpath, "fps.pickle"), "wb")

    for batch_num, start, stop in batches:
        # Report progress
        print('Current batch: {} of {} \t'.format(batch_num, len(batches)),
              end = '\r')
        sys.stdout.flush()
        batch_mol_list = mol_list[start:stop]
        # Multiprocessing with all available threads
        #with Pool(processes = os.cpu_count()) as pool:
        #    fps_and_props = pool.map(cal_fp_props, batch_mol_list)

        # Use process_map for a progress bar
        fps_and_props = process_map(cal_fp_props, batch_mol_list,
                                    chunksize = 100,
                                    max_workers = a.worker)

        # Unpack the returned results
        fps, props = zip(*fps_and_props)

        # Turn props into a list of named tuples
        if props_named_tuple:
            props = [props_named_tuple(*p) for p in props]

        # Append one pickle record per batch
        pickle.dump(props, propfile)
        pickle.dump(fps, fpfile)

    propfile.close()
    fpfile.close()

    # Read back the per-batch records and rewrite each file as a single record
    print('Combining pickle files')
    sys.stdout.flush()
    props = []
    with open(os.path.join(outpath, "props.pickle"), 'rb') as pf:
        try:
            while True:
                props.extend(pickle.load(pf))
        except EOFError:
            pass
    with open(os.path.join(outpath, "props.pickle"), "wb+") as f:
        pickle.dump(props, f)

    fps = []
    with open(os.path.join(outpath, "fps.pickle"), 'rb') as pf:
        try:
            while True:
                fps.extend(pickle.load(pf))
        except EOFError:
            pass
    with open(os.path.join(outpath, "fps.pickle"), "wb+") as f:
        pickle.dump(fps, f)
    print('Batch processing complete!')
    sys.stdout.flush()
    return fps, props

def MolsToLSHForest(mol_list, save_path = "./", worker = os.cpu_count()-1, batch_size = None):

    print('Available CPU Cores =', os.cpu_count())
    print('Number of CPU Cores used =', worker)
    print('\nTotal Number of Mols =', len(mol_list))
    if batch_size: print('Batch Size =', batch_size)
    if not os.path.exists(save_path): os.makedirs(save_path)
    print('Saving Files at', save_path)
    sys.stdout.flush()

    if batch_size:
        fps, props = batch_convert(mol_list, batch_size, save_path, props_named_tuple = Props)
    else:
        fps, props = single_convert(mol_list, save_path, props_named_tuple = Props)

    print("Loading data and converting to LSH Forest data")
    print('Converting MinHash Fingerprints to Vectors')
    sys.stdout.flush()
    fps = [tm.VectorUint(fp) for fp in fps]
    print(len(fps), 'Fingerprints Converted')
    sys.stdout.flush()

    lf.batch_add(fps)
    lf.index()
    lf.store(os.path.join(save_path, "lf.dat"))

    return lf, props

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(prog='LSH Forest Data Converter')
    parser.add_argument("filename", metavar = 'F', type=str,
                        help="Path to the input file, either .smi (with no header), .sd, or .sdf")
    parser.add_argument("-b", "--batch", type=int,
                        help="Batch size for processing, defaults to all (one batch)",
                        default = None)
    parser.add_argument("-o", "--output", type=str,
                        help="Output file path, defaults to ./LSHData",
                        default = "./LSHData")
    parser.add_argument("-w", "--worker", type=int,
                        help="Number of workers (CPU cores) to use for multiprocessing, \
                              defaults to the number of available CPU cores minus one",
                        default = os.cpu_count()-1)
    parser.add_argument("-d", "--dim", type=int,
                        help="Fingerprint dimension, defaults to 1024",
                        default = 1024)

    a = parser.parse_args()
    outpath = os.path.abspath(a.output)
    mols = file_to_mols(a.filename)

    # Define a named properties tuple.
    # To pickle a named tuple correctly:
    ## 1) the named tuple class has to be declared under __main__, and
    ## 2) the variable the named tuple is assigned to has to match
    ##    the type name given in quotation marks.
    Props = namedtuple('Props', ['SMILES', 'MolWt', 'LogP', 'QED', 'SAS'])

    # MinHash fingerprint (MHFP) encoder, a specialized molecular fingerprint scheme
    enc = MHFPEncoder(a.dim)
    # Locality-Sensitive Hashing Forest
    lf = tm.LSHForest(a.dim, 64)

    MolsToLSHForest(mol_list = mols,
                    save_path = outpath,
                    worker = a.worker,
                    batch_size = a.batch)
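Since this commit's subject is tmap plotting, a minimal downstream sketch may help show how the files written by LSH_batch.py feed a tmap layout. This is an assumption-laden illustration, not code from this commit: it uses the standard tmap calls LSHForest.restore and tm.layout_from_lsh_forest, and it presumes the converter was run with the default output directory, e.g. python LSH_batch.py molecules.smi -o ./LSHData.

# Hypothetical consumer of the files written above; not part of this commit.
import pickle
from collections import namedtuple
import tmap as tm

# Props must be re-declared under __main__ so the pickled named tuples resolve
Props = namedtuple('Props', ['SMILES', 'MolWt', 'LogP', 'QED', 'SAS'])

dim = 1024                    # must match the -d/--dim used by LSH_batch.py
lf = tm.LSHForest(dim, 64)
lf.restore("LSHData/lf.dat")  # forest stored by MolsToLSHForest

with open("LSHData/props.pickle", "rb") as pf:
    props = pickle.load(pf)   # Props tuples: SMILES, MolWt, LogP, QED, SAS

# Graph layout for plotting: per-point x/y plus spanning-tree edges (s, t)
x, y, s, t, _ = tm.layout_from_lsh_forest(lf)
print(len(props), "molecules laid out")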

sampledock/SnD/docking.py

Lines changed: 8 additions & 2 deletions
@@ -1,4 +1,10 @@
-# Python wrapper for rDock tools
+# Python wrapper for rDock tools. Uses the rDock CLI
+# This is part of the SampleDock package
+#
+# COMMERCIAL LICENSE: https://github.com/atfrank/SampleDock/blob/master/LICENSE_COMMERICAL
+# NON-COMMERCIAL LICENSE: https://github.com/atfrank/SampleDock/blob/master/LICENSE_NON-COMMERICAL
+# Frank Lab 2020-2021
+
 from rdkit import Chem
 from rdkit.Chem import AllChem
 import os

@@ -70,7 +76,7 @@ def save_pose(poses_tuple, save_dir):
     parser.add_argument("-i","--input", help="directory that contains prepared .sd files")
     parser.add_argument("-o","--output", help="directory for output, default to ./test",
                         default = "./test")
-    parser.add_argument("-r","--prm", help="path to pocket .prm file")
+    parser.add_argument("-r","--prm", help="path to pocket.prm file")
     parser.add_argument("-n","--npose", help="number of poses for docking, default to 100",
                         default = 100)
     parser.add_argument("-s","--sort", help="filter for sdsort, default to SCORE.INTER",

sampledock/SnD/post_process.py

Lines changed: 9 additions & 7 deletions
@@ -1,4 +1,9 @@
 # Post processing script for sample and dock generated molecules
+# This is part of the SampleDock package
+#
+# COMMERCIAL LICENSE: https://github.com/atfrank/SampleDock/blob/master/LICENSE_COMMERICAL
+# NON-COMMERCIAL LICENSE: https://github.com/atfrank/SampleDock/blob/master/LICENSE_NON-COMMERICAL
+# Frank Lab 2020-2021
 
 import pandas as pd
 from rdkit import Chem

@@ -48,7 +53,7 @@ def combine_designs(inpath, outpath):
     # Multiprocessing
     with Pool(processes = os.cpu_count()-1) as pool:
         results = pool.starmap(process_by_folder, zip(folders, repeat(inpath)))
-
+
     # Retrieve results
     mol_lists, best_mols = zip(*results)
     # Create the list of all mols

@@ -82,7 +87,7 @@ def combine_designs(inpath, outpath):
 def create_df(mol_list):
     # Create a dataframe with all these mol properties
     # These props should exist if the designs are post-processed by the functions above
-    mol_props = ['Name','Cycle','SCORE.INTER','SMILES','LogP','QED','MolWeight','SAS']
+    mol_props = ['Name','Cycle','Score','SMILES','LogP','QED','MolWeight','SAS']
     df = pd.DataFrame()
 
     # Fill df with lists

@@ -95,9 +100,6 @@ def create_df(mol_list):
             df[prop] = df[prop].astype(inferred_type)
         except ValueError:
             pass
-
-    # Add mol objects to the last column
-    df['Mol'] = mol_list
     return df
 
 def mkdf(all_mols, best_mols, outpath):

@@ -111,8 +113,8 @@ def mkdf(all_mols, best_mols, outpath):
     sortedscores.drop_duplicates('SMILES', inplace = True, keep = 'first')
 
     # Save as csv
-    allscores.drop(columns=['Mol']).to_csv(outpath+'/allscores.csv', index = False)
-    sortedscores.drop(columns=['Mol']).to_csv(outpath+'/sortedscores.csv', index = False)
+    allscores.to_csv(outpath+'/allscores.csv', index = False)
+    sortedscores.to_csv(outpath+'/sortedscores.csv', index = False)
     print('Dataframes saved!')
     sys.stdout.flush()
     return allscores, minscores
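A minimal sketch of consuming the CSVs written by mkdf after this change; the output directory is illustrative. The column names follow the mol_props list in this commit ('SCORE.INTER' is renamed to 'Score', and the RDKit Mol column is no longer carried in the dataframe, so the CSVs need no drop(columns=['Mol']) on the way out).

# Hypothetical reader for the CSVs saved above; "./test" mirrors the docking
# wrapper's default output directory and is an assumption, not from this diff.
import pandas as pd

sortedscores = pd.read_csv("./test/sortedscores.csv")

# rDock interaction scores are lower-is-better, so take the smallest values
top10 = sortedscores.nsmallest(10, "Score")
print(top10[["Name", "Cycle", "Score", "SMILES", "QED", "SAS"]])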
