Multiprocessing on post-process

Truman-Xu · Truman-Xu · commit 2bc0e8883c8d · 2020-10-04T22:27:40.000-04:00
diff --git a/sampledock/SnD/docking.py b/sampledock/SnD/docking.py
@@ -7,7 +7,7 @@
 
 def dock(ligs, dock_dir, prmfile, docking_prm, npose, prefix = 'docked'):
     # ligs must be a list of file path
-    print('Docking in Progress\t', end = '\r')
+    print('[INFO]: Docking in Progress\t', end = '\r')
     sys.stdout.flush()
     procs = []
     for i,lig in enumerate(ligs):
@@ -21,7 +21,7 @@ def dock(ligs, dock_dir, prmfile, docking_prm, npose, prefix = 'docked'):
     for proc in procs:
         # makes sure the docking has completed before sorting the score
         proc.wait()
-    print('Docking Complete!  \t', end = '\r')
+    print('[INFO]: Docking Complete!  \t', end = '\r')
     sys.stdout.flush()
 
 def sort_pose(dock_dir, sort_by, prefix = None):
@@ -46,7 +46,7 @@ def sort_pose(dock_dir, sort_by, prefix = None):
         # retrieve the best pose mol for each design
         best_pose = sorted_poses[0]
         best_poses.append((float(best_pose.GetProp(sort_by)),best_pose.GetProp('Name'),best_pose))
-    print('Docked Poses Sorted       \t', end = '\r')
+    print('[INFO]: Docked Poses Sorted       \t', end = '\r')
     sys.stdout.flush()
     # return the sorted tuple (ranked design based on the score of the best pose)
     return sorted(best_poses)
diff --git a/sampledock/SnD/post_process.py b/sampledock/SnD/post_process.py
@@ -1,91 +1,135 @@
+# Post processing script for sample and dock generated molecules
+
 import pandas as pd
 from rdkit import Chem
 from rdkit.Chem import AllChem, Draw
 import os
+from multiprocessing import Pool
+from itertools import repeat
 
+from rdkit.Chem.PropertyMol import PropertyMol # Allow pickle on mol props for multiprocessing
 from rdkit.Chem import RDConfig # Allow Contrib packages to be used
-from rdkit.Chem.Crippen import MolLogP as LogP
-from rdkit.Chem.QED import default as QED
-from rdkit.Chem.Descriptors import MolWt
+from rdkit.Chem.Crippen import MolLogP as LogP # Lipophilicity
+from rdkit.Chem.QED import default as QED # Quantitative Estimate of Drug-likeness
+from rdkit.Chem.Descriptors import MolWt # Mol Weight
 import sys
 sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score'))
-from sascorer import calculateScore as SAS
+# add path for rdkit Contrib packages
+from sascorer import calculateScore as SAS # Synthetic Accessibility Score
+
+# Calculate mol properties for the sd file of one cycle folder (multiprocessing worker)
+def process_by_folder(fd, inpath):
+    """Return (mols, best_mol) read from inpath/fd/ranked_designs.sd, properties set."""
+    cycle = fd.strip("cycle_")
+    sd = inpath+'/'+fd+'/ranked_designs.sd'
+    if not os.path.exists(sd):
+        # Fail with a clear message instead of an UnboundLocalError at the return
+        raise FileNotFoundError('ranked_designs.sd not found: ' + sd)
+    # PropertyMol keeps the mol props when results are pickled back to the parent
+    cir_mols = [PropertyMol(m) for m in Chem.SDMolSupplier(sd)]
+    for m in cir_mols:
+        m.SetProp('Cycle',cycle)
+        m.SetProp('MolWeight', str(MolWt(m)))
+        m.SetProp('LogP', str(LogP(m)))
+        m.SetProp('QED', str(QED(m)))
+        m.SetProp('SAS', str(SAS(m)))
+    return cir_mols, cir_mols[0]  # first record = highest score design in the cycle
+
+# Calculate mol properties per cycle in parallel, save combined sdf files, return (all_mols, best_mols)
+def combine_designs(inpath, outpath):
+    """Merge ranked designs from every cycle_* folder in inpath; raises Exception if none exist."""
+    folders = [x for x in os.listdir(inpath) if x.startswith('cycle_')]
+    # sort the cycle folders by their cycle number
+    folders.sort(key=lambda x: int(x.strip('cycle_')))
 
-def mkdf(directory,output):
-    folders = [x for x in os.listdir(directory) if x.startswith('cycle_')]
-    if len(folders) == 0:
-        raise Exception('No "cycle_" folder found!')
-    scores = pd.DataFrame()
-    for i,fd in enumerate(folders):
-        df = pd.DataFrame()
-        fd_path = os.path.join(directory,fd)
-        mols = Chem.SDMolSupplier(fd_path+'/ranked_designs.sd')
-        df['Design'] = [m.GetProp('Name') for m in mols]
-        df['Cycle'] = i
-        df['Score'] = [float(m.GetProp('SCORE.INTER')) for m in mols]
-        df['SMILES'] = [m.GetProp('SMILES') for m in mols]
-        df['Mol'] = [m for m in mols]
-        df['LogP'] = [LogP(m) for m in mols]
-        df['QED'] = [QED(m) for m in mols]
-        df['MolWt'] = [MolWt(m) for m in mols]
-        df['SAS'] = [SAS(m) for m in mols]
-        scores = pd.concat([scores,df])
-
-    minscores = scores[scores.index == 0]
-    minscores = minscores.sort_values('Score')
-    minscores.drop_duplicates('SMILES', inplace = True, keep = 'first')
-    scores.to_csv(output+'/all_design.csv')
-    minscores.to_csv(output+'/best_designs.csv')
-    print("DataFrames Saved!")
-    return scores, minscores
-
-def combine_designs(directory, output):
-    folders = [x for x in os.listdir(directory) if x.startswith('cycle_')]
     if len(folders) == 0:
         raise Exception('No "cycle_" folder found!')
-    mols = []
-    best_mols = []
-    wa = Chem.SDWriter(output+'/All_Designs.sdf')
-    wb = Chem.SDWriter(output+'/Best_Designs.sdf')
-    for fd in folders:
-        cycle = fd.strip("cycle_")
-        sd = directory+'/'+fd+'/ranked_designs.sd'
-        if os.path.exists(sd):
-            cir_mols = Chem.SDMolSupplier(sd)
-            for i, m in enumerate(cir_mols):
-                m.SetProp('Cycle',cycle)
-                m.SetProp('MolWeight', str(MolWt(m)))
-                m.SetProp('LogP', str(LogP(m)))
-                m.SetProp('QED', str(QED(m)))
-                m.SetProp('SAS', str(SAS(m)))
-                mols.append(m)
-                wa.write(m)
-                if i == 0: 
-                    # Select the highest score design in the cycle
-                    best_mols.append(m)
-                    wb.write(m)
-                if int(cycle)%5000 == 0: 
-                    wa.flush()
-                    wb.flush()
-    wa.close()
-    wb.close()
-    print(len(mols), "total molecules combined from", len(folders),"cycles in\n", directory)
-    print(len(best_mols), "selected")
+    # Multiprocessing: one worker per core minus one, but never fewer than one,
+    # since Pool(processes=0) raises ValueError and os.cpu_count() may return None
+    with Pool(processes = max(1, (os.cpu_count() or 1) - 1)) as pool:
+        results = pool.starmap(process_by_folder, zip(folders, repeat(inpath)))
+
+    # Retrieve results
+    mol_lists, best_mols = zip(*results)
+    # Create the list of all mols
+    all_mols = []
+    for l in mol_lists:
+        all_mols.extend(l)
+    # Convert tuple to list
+    best_mols = list(best_mols)
+
+    print(len(all_mols), "total molecules combined from", len(folders),"cycles in\n", inpath)
+    print(len(best_mols), "best designs extracted.\n")
+    sys.stdout.flush()
+
+    # Save as sdf
+    with open(outpath+'/All_Designs.sdf','w') as outfile:
+        w = Chem.SDWriter(outfile)
+        for m in all_mols:
+            w.write(m)
+        w.close()
+
+    with open(outpath+'/Best_Designs.sdf','w') as outfile:
+        w = Chem.SDWriter(outfile)
+        for m in best_mols:
+            w.write(m)
+        w.close()
+    print('Mols saved!')
     sys.stdout.flush()
-    return mols, best_mols
+
+    return all_mols, best_mols
+
+# Build a dataframe with one row per mol from the properties stored on each mol
+def create_df(mol_list):
+    df = pd.DataFrame()
+    # Name, SCORE.INTER and SMILES are presumably set upstream by docking - TODO confirm
+    df['Design'] = [m.GetProp('Name') for m in mol_list]
+    df['Cycle'] = [int(m.GetProp('Cycle')) for m in mol_list]
+    df['Score'] = [float(m.GetProp('SCORE.INTER')) for m in mol_list]
+    df['SMILES'] = [m.GetProp('SMILES') for m in mol_list]
+    df['Mol'] = [m for m in mol_list]  # the mol objects themselves
+    df['LogP'] = [float(m.GetProp('LogP')) for m in mol_list]  # props are stored as str
+    df['QED'] = [float(m.GetProp('QED')) for m in mol_list]
+    df['MolWt'] = [float(m.GetProp('MolWeight')) for m in mol_list]  # prop is 'MolWeight'
+    df['SAS'] = [float(m.GetProp('SAS')) for m in mol_list]
+    # every column has len(mol_list) rows, in mol_list order
+    return df
+
+def mkdf(all_mols, best_mols, outpath):
+    # Build dataframes from both mol lists and save them as csv files in outpath
+    allscores = create_df(all_mols)
+    minscores = create_df(best_mols)
+    # sort_values returns a new frame, so minscores itself stays unsorted
+    # sort the best designs by docking score (ascending)
+    sortedscores = minscores.sort_values('Score')
+    # Drop duplicated SMILES, keeping the first (lowest Score) entry
+    sortedscores.drop_duplicates('SMILES', inplace = True, keep = 'first')
+
+    # Save as csv; the Mol column holds mol objects and is dropped first
+    allscores.drop(columns=['Mol']).to_csv(outpath+'/allscores.csv', index = False)
+    sortedscores.drop(columns=['Mol']).to_csv(outpath+'/sortedscores.csv', index = False)
+    print('Dataframes saved!')
+    sys.stdout.flush()
+    return allscores, minscores  # NOTE(review): returns unsorted minscores, not sortedscores - confirm intended
 
 if __name__ == "__main__":
     import argparse
     parser = argparse.ArgumentParser(description="combine and the ranked_designs.sd in each "+
                                      "'cycle_*' folder from Sample and Dock and calculate MolWeight, SAS, LogP, and QED.")
     parser.add_argument("-i","--input", help="input directory that contain folder by cycles")
-    parser.add_argument("-o","--outpath", help="output directory for the combined sdf file",
-                        default='./')
+    parser.add_argument("-o","--outpath", help="output directory for the combined sdf file, "+\
+                        "default to <input>/All_Designs_Processed")
     a = parser.parse_args()
-    directory = os.path.abspath(a.input)
-    out = os.path.abspath(a.outpath)
-    if not os.path.exists(out): 
-        os.makedirs(out)
-        print(out, "Made")
-    combine_designs(directory, out)
-    mkdf(directory, out)
+    inpath = os.path.abspath(a.input)
+    # Without -o the output goes to <input>/All_Designs_Processed/
+    if a.outpath:
+        outpath = os.path.abspath(a.outpath)
+    else: outpath = inpath+"/All_Designs_Processed/"
+    # Create the output directory if it does not exist yet
+    if not os.path.exists(outpath):
+        os.makedirs(outpath)
+        print("Directory Made:")
+        print(outpath)
+        sys.stdout.flush()
+    allmols, bestmols = combine_designs(inpath, outpath)
+    mkdf(allmols, bestmols, outpath)
diff --git a/sampledock/__main__.py b/sampledock/__main__.py
@@ -100,11 +100,12 @@
 
     print("[INFO]: Cycle %s: %s %s kcal/mol"%(j, smi, energy)+'\t'*6)
 
+print("\n", p.ncycle, "cycles of design finished. Starting post-processing.")
 # Create post-process working directory
 postproc_wd = os.path.join(wd, "All_Designs_Processed")
 os.makedirs(postproc_wd)
 # Extract all ranked designs from ejach cycle and combine in one sdf file
-combine_designs(wd, postproc_wd)
+allmols, bestmols = combine_designs(wd, postproc_wd)
 # Create pandas dataframe for summary
-mkdf(wd, postproc_wd)
+mkdf(allmols, bestmols, postproc_wd)