Skip to content

Commit 2bc0e88

Browse files
committed
Multiprocessing on post-process
1 parent 9376cb2 commit 2bc0e88

File tree

3 files changed

+122
-77
lines changed

3 files changed

+122
-77
lines changed

sampledock/SnD/docking.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
def dock(ligs, dock_dir, prmfile, docking_prm, npose, prefix = 'docked'):
99
# ligs must be a list of file path
10-
print('Docking in Progress\t', end = '\r')
10+
print('[INFO]: Docking in Progress\t', end = '\r')
1111
sys.stdout.flush()
1212
procs = []
1313
for i,lig in enumerate(ligs):
@@ -21,7 +21,7 @@ def dock(ligs, dock_dir, prmfile, docking_prm, npose, prefix = 'docked'):
2121
for proc in procs:
2222
# makes sure the docking has completed before sorting the score
2323
proc.wait()
24-
print('Docking Complete! \t', end = '\r')
24+
print('[INFO]: Docking Complete! \t', end = '\r')
2525
sys.stdout.flush()
2626

2727
def sort_pose(dock_dir, sort_by, prefix = None):
@@ -46,7 +46,7 @@ def sort_pose(dock_dir, sort_by, prefix = None):
4646
# retrieve the best pose mol for each design
4747
best_pose = sorted_poses[0]
4848
best_poses.append((float(best_pose.GetProp(sort_by)),best_pose.GetProp('Name'),best_pose))
49-
print('Docked Poses Sorted \t', end = '\r')
49+
print('[INFO]: Docked Poses Sorted \t', end = '\r')
5050
sys.stdout.flush()
5151
# return the sorted tuple (ranked design based on the score of the best pose)
5252
return sorted(best_poses)

sampledock/SnD/post_process.py

Lines changed: 116 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -1,91 +1,135 @@
1+
# Post processing script for sample and dock generated molecules
2+
13
import pandas as pd
24
from rdkit import Chem
35
from rdkit.Chem import AllChem, Draw
46
import os
7+
from multiprocessing import Pool
8+
from itertools import repeat
59

10+
from rdkit.Chem.PropertyMol import PropertyMol # Allow pickle on mol props for multiprocessing
611
from rdkit.Chem import RDConfig # Allow Contrib packages to be used
7-
from rdkit.Chem.Crippen import MolLogP as LogP
8-
from rdkit.Chem.QED import default as QED
9-
from rdkit.Chem.Descriptors import MolWt
12+
from rdkit.Chem.Crippen import MolLogP as LogP # Lipophilicity
13+
from rdkit.Chem.QED import default as QED # Quantitative Estimate of Drug-likeness
14+
from rdkit.Chem.Descriptors import MolWt # Mol Weight
1015
import sys
1116
sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score'))
12-
from sascorer import calculateScore as SAS
17+
# add path for rdkit Contrib packages
18+
from sascorer import calculateScore as SAS # Synthetic Accessibility Score
19+
20+
# Function for calculate mol properties for sd files in each folder for multiprocessing
21+
def process_by_folder(fd, inpath):
    """Load one cycle folder's ranked designs and annotate them with properties.

    Reads ``<inpath>/<fd>/ranked_designs.sd``, wraps every readable molecule
    in ``PropertyMol`` (so the properties survive pickling when the result is
    sent back from a worker process) and stores the cycle label, molecular
    weight, LogP, QED and SA score on each molecule as string properties.

    Args:
        fd: Name of the cycle folder, e.g. ``"cycle_12"``.
        inpath: Directory that contains the cycle folders.

    Returns:
        A tuple ``(cir_mols, best_mol)``: the list of annotated molecules and
        the first molecule of the file (the file is assumed to be ranked
        best-first -- confirm against the writer of ``ranked_designs.sd``).

    Raises:
        FileNotFoundError: If the folder has no ``ranked_designs.sd``.
        ValueError: If the file contains no readable molecule.
    """
    prefix = "cycle_"
    # str.strip("cycle_") removes a *set of characters* from both ends, not a
    # prefix; slice the prefix off explicitly instead.
    cycle = fd[len(prefix):] if fd.startswith(prefix) else fd
    sd = os.path.join(inpath, fd, 'ranked_designs.sd')

    # Fail with a clear message instead of reaching the return statement
    # with unbound locals.
    if not os.path.exists(sd):
        raise FileNotFoundError('No ranked_designs.sd found in ' + os.path.join(inpath, fd))

    # SDMolSupplier yields None for records it cannot parse; skip those.
    cir_mols = [PropertyMol(m) for m in Chem.SDMolSupplier(sd) if m is not None]
    if not cir_mols:
        raise ValueError('No readable molecule in ' + sd)

    for m in cir_mols:
        # Calculate properties for each mol
        m.SetProp('Cycle', cycle)
        m.SetProp('MolWeight', str(MolWt(m)))
        m.SetProp('LogP', str(LogP(m)))
        m.SetProp('QED', str(QED(m)))
        m.SetProp('SAS', str(SAS(m)))

    # The first entry is taken as the best design of the cycle.
    return cir_mols, cir_mols[0]
37+
38+
# calculated mol properties from each cycle and combine mols in one sdf file
39+
def _write_sdf(mols, path):
    """Write every molecule in *mols* to the SD file at *path*."""
    with open(path, 'w') as outfile:
        w = Chem.SDWriter(outfile)
        try:
            for m in mols:
                w.write(m)
        finally:
            # Close the writer even if a write fails.
            w.close()


def combine_designs(inpath, outpath):
    """Collect the ranked designs of every cycle and save them as SD files.

    Every ``cycle_*`` folder in *inpath* that contains a
    ``ranked_designs.sd`` is handed to ``process_by_folder`` in a worker
    pool. The annotated molecules are written to
    ``<outpath>/All_Designs.sdf`` and the per-cycle best designs to
    ``<outpath>/Best_Designs.sdf``.

    Args:
        inpath: Directory that contains the ``cycle_*`` folders.
        outpath: Existing directory that receives the two SD files.

    Returns:
        A tuple ``(all_mols, best_mols)`` of lists, in cycle order.

    Raises:
        Exception: If there is no ``cycle_*`` folder, or none of them
            contains a ``ranked_designs.sd``.
        ValueError: If a folder name after ``cycle_`` is not an integer.
    """
    prefix = 'cycle_'
    # list the folders in the directory for all cycles
    folders = [x for x in os.listdir(inpath) if x.startswith(prefix)]
    if not folders:
        raise Exception('No "cycle_" folder found!')
    # Sort numerically by cycle number. The prefix is sliced off because
    # str.strip('cycle_') strips a character set rather than a prefix.
    folders.sort(key=lambda x: int(x[len(prefix):]))

    # Skip cycles that produced no ranked_designs.sd, so a single missing
    # file does not abort the whole post-processing run.
    folders = [fd for fd in folders
               if os.path.exists(os.path.join(inpath, fd, 'ranked_designs.sd'))]
    if not folders:
        raise Exception('No "ranked_designs.sd" found in any "cycle_" folder!')

    # Leave one core free, but never ask for fewer than one worker:
    # os.cpu_count() can be 1 (giving 0 processes) or None.
    n_workers = max(1, (os.cpu_count() or 1) - 1)
    with Pool(processes=n_workers) as pool:
        results = pool.starmap(process_by_folder, zip(folders, repeat(inpath)))

    # Retrieve results
    mol_lists, best_mols = zip(*results)
    all_mols = [m for mols in mol_lists for m in mols]
    best_mols = list(best_mols)

    print(len(all_mols), "total molecules combined from", len(folders),"cycles in\n", inpath)
    print(len(best_mols), "best designs extracted.\n")
    sys.stdout.flush()

    # Save as sdf
    _write_sdf(all_mols, outpath+'/All_Designs.sdf')
    _write_sdf(best_mols, outpath+'/Best_Designs.sdf')
    print('Mols saved!')
    sys.stdout.flush()

    return all_mols, best_mols
81+
82+
# Create dataframe with all the properties
83+
def create_df(mol_list):
    """Build a summary DataFrame from molecules carrying string properties.

    Each molecule must expose ``GetProp`` for the properties ``Name``,
    ``Cycle``, ``SCORE.INTER``, ``SMILES``, ``LogP``, ``QED``, ``MolWeight``
    and ``SAS``. Numeric properties are parsed back from their string form.

    Args:
        mol_list: Iterable of molecules. It is materialised once, so a
            generator is accepted as well as a list.

    Returns:
        A DataFrame with the columns ``Design``, ``Cycle``, ``Score``,
        ``SMILES``, ``Mol``, ``LogP``, ``QED``, ``MolWt`` and ``SAS``, one row
        per molecule in input order. ``Mol`` holds the molecule objects.
    """
    # Materialise once: the columns below each need a full pass over the
    # molecules, which would exhaust a one-shot iterable after the first.
    mols = list(mol_list)

    columns = {
        'Design': [m.GetProp('Name') for m in mols],
        'Cycle': [int(m.GetProp('Cycle')) for m in mols],
        'Score': [float(m.GetProp('SCORE.INTER')) for m in mols],
        'SMILES': [m.GetProp('SMILES') for m in mols],
        'Mol': mols,
        'LogP': [float(m.GetProp('LogP')) for m in mols],
        'QED': [float(m.GetProp('QED')) for m in mols],
        'MolWt': [float(m.GetProp('MolWeight')) for m in mols],
        'SAS': [float(m.GetProp('SAS')) for m in mols],
    }
    # Pass the column order explicitly so it does not depend on dict ordering.
    return pd.DataFrame(columns, columns=list(columns))
97+
98+
def mkdf(all_mols, best_mols, outpath):
    """Tabulate the designs and save the tables as CSV files.

    Builds one table for all molecules and one for the best designs, ranks
    the best designs by ``Score`` (ascending), keeps the first row for each
    distinct ``SMILES``, and writes ``allscores.csv`` and ``sortedscores.csv``
    into *outpath* without the ``Mol`` column and without the index.

    Args:
        all_mols: Molecules from every cycle.
        best_mols: Best design of each cycle.
        outpath: Directory that receives the CSV files.

    Returns:
        A tuple ``(allscores, minscores)``. Note that ``minscores`` is the
        best-design table as built, i.e. before sorting and de-duplication;
        only the CSV file holds the sorted, de-duplicated version.
    """
    # Create one dataframe per molecule list
    allscores, minscores = create_df(all_mols), create_df(best_mols)

    # Rank by docking score, then drop duplicated SMILES (first one wins)
    sortedscores = minscores.sort_values('Score').drop_duplicates('SMILES', keep = 'first')

    # Save as csv; the molecule objects are left out of the files
    for frame, fname in ((allscores, '/allscores.csv'), (sortedscores, '/sortedscores.csv')):
        frame.drop(columns=['Mol']).to_csv(outpath+fname, index = False)

    print('Dataframes saved!')
    sys.stdout.flush()
    return allscores, minscores
76114

77115
if __name__ == "__main__":
    # Command-line entry point: combine the per-cycle SD files found under
    # the input directory, then write the summary CSV tables.
    import argparse
    parser = argparse.ArgumentParser(description="combine and the ranked_designs.sd in each "
        "'cycle_*' folder from Sample and Dock and calculate MolWeight, SAS, LogP, and QED.")
    # The input directory is mandatory; without it os.path.abspath(None)
    # would fail with an unhelpful TypeError.
    parser.add_argument("-i","--input", required=True,
        help="input directory that contain folder by cycles")
    # The help text states the default that is actually applied below.
    parser.add_argument("-o","--outpath", help="output directory for the combined sdf file, "
        "default to <input>/All_Designs_Processed")
    a = parser.parse_args()
    inpath = os.path.abspath(a.input)

    if a.outpath:
        outpath = os.path.abspath(a.outpath)
    else:
        outpath = os.path.join(inpath, "All_Designs_Processed")

    if not os.path.exists(outpath):
        os.makedirs(outpath)
        print("Directory Made:")
        print(outpath)
        sys.stdout.flush()
    allmols, bestmols = combine_designs(inpath, outpath)
    mkdf(allmols, bestmols, outpath)

sampledock/__main__.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -100,11 +100,12 @@
100100

101101
print("[INFO]: Cycle %s: %s %s kcal/mol"%(j, smi, energy)+'\t'*6)
102102

103+
print("\n", p.ncycle, "cycles of design finished. Starting post-processing.")
103104
# Create post-process working directory
104105
postproc_wd = os.path.join(wd, "All_Designs_Processed")
105106
os.makedirs(postproc_wd)
106107
# Extract all ranked designs from each cycle and combine in one sdf file
107-
combine_designs(wd, postproc_wd)
108+
allmols, bestmols = combine_designs(wd, postproc_wd)
108109
# Create pandas dataframe for summary
109-
mkdf(wd, postproc_wd)
110+
mkdf(allmols, bestmols, postproc_wd)
110111

0 commit comments

Comments
 (0)