|
| 1 | +import pandas as pd |
| 2 | +from rdkit import Chem |
| 3 | +from rdkit.Chem import AllChem, Draw |
| 4 | +import os |
| 5 | + |
| 6 | +from rdkit.Chem import RDConfig # Allow Contrib packages to be used |
| 7 | +from rdkit.Chem.Crippen import MolLogP as LogP |
| 8 | +from rdkit.Chem.QED import default as QED |
| 9 | +from rdkit.Chem.Descriptors import MolWt |
| 10 | +import sys |
| 11 | +sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score')) |
| 12 | +from sascorer import calculateScore as SAS |
| 13 | + |
def mkdf(directory, output):
    """Tabulate every design found in the ``cycle_*`` folders of *directory*.

    For each sub-folder whose name starts with ``cycle_`` the molecules in
    its ``ranked_designs.sd`` are read, and one row per molecule is built
    from the SD properties ``Name``, ``SCORE.INTER`` and ``SMILES`` together
    with the computed LogP, QED, molecular weight and SA score.

    Args:
        directory: Folder that contains the ``cycle_*`` sub-folders.
        output: Folder that receives ``all_design.csv`` and
            ``best_designs.csv``; this function does not create it.

    Returns:
        A ``(scores, minscores)`` tuple of DataFrames. ``scores`` holds every
        design of every cycle. ``minscores`` holds the first design of each
        cycle, sorted by ascending ``Score`` and de-duplicated on ``SMILES``
        (the lowest-scoring occurrence is kept).

    Raises:
        FileNotFoundError: If *directory* has no ``cycle_*`` entry.
    """
    prefix = 'cycle_'
    columns = ['Design', 'Cycle', 'Score', 'SMILES', 'Mol',
               'LogP', 'QED', 'MolWt', 'SAS']

    folders = [x for x in os.listdir(directory) if x.startswith(prefix)]
    if not folders:
        raise FileNotFoundError('No "cycle_" folder found!')

    def cycle_id(folder):
        # The cycle is whatever follows the prefix; use an int when possible
        # so the column sorts numerically.
        suffix = folder[len(prefix):]
        return int(suffix) if suffix.isdecimal() else suffix

    def sort_key(folder):
        # Numeric cycles first (in numeric order), then any other suffix.
        cid = cycle_id(folder)
        return (0, cid, '') if isinstance(cid, int) else (1, 0, cid)

    frames = []
    # os.listdir order is arbitrary, so sort for a reproducible table.
    for fd in sorted(folders, key=sort_key):
        sd_path = os.path.join(directory, fd, 'ranked_designs.sd')
        if not os.path.exists(sd_path):
            # Same policy as combine_designs: a cycle without results is skipped.
            continue
        # Read the file once; records RDKit could not parse come back as None.
        mols = [m for m in Chem.SDMolSupplier(sd_path) if m is not None]
        frames.append(pd.DataFrame({
            'Design': [m.GetProp('Name') for m in mols],
            'Cycle': cycle_id(fd),
            'Score': [float(m.GetProp('SCORE.INTER')) for m in mols],
            'SMILES': [m.GetProp('SMILES') for m in mols],
            'Mol': mols,
            'LogP': [LogP(m) for m in mols],
            'QED': [QED(m) for m in mols],
            'MolWt': [MolWt(m) for m in mols],
            'SAS': [SAS(m) for m in mols],
        }, columns=columns))

    # Each per-cycle frame keeps its own 0..n-1 index, so index 0 marks the
    # first readable design of every cycle.
    scores = pd.concat(frames) if frames else pd.DataFrame(columns=columns)

    minscores = (
        scores[scores.index == 0]
        .sort_values('Score')
        .drop_duplicates('SMILES', keep='first')
    )

    scores.to_csv(os.path.join(output, 'all_design.csv'))
    minscores.to_csv(os.path.join(output, 'best_designs.csv'))
    print("DataFrames Saved!")
    return scores, minscores
| 41 | + |
def combine_designs(directory, output):
    """Merge the ``ranked_designs.sd`` of every cycle into two SD files.

    Every readable molecule is tagged with the properties ``Cycle``,
    ``MolWeight``, ``LogP``, ``QED`` and ``SAS`` and written to
    ``All_Designs.sdf``; the first readable molecule of each cycle is also
    written to ``Best_Designs.sdf``.

    Args:
        directory: Folder that contains the ``cycle_*`` sub-folders.
        output: Folder that receives the two ``.sdf`` files.

    Returns:
        A ``(mols, best_mols)`` tuple: all tagged molecules, and the first
        molecule of each cycle.

    Raises:
        FileNotFoundError: If *directory* has no ``cycle_*`` entry.
    """
    from contextlib import closing

    prefix = 'cycle_'
    folders = [x for x in os.listdir(directory) if x.startswith(prefix)]
    if not folders:
        raise FileNotFoundError('No "cycle_" folder found!')

    def sort_key(folder):
        # Numeric cycles first (in numeric order), then any other suffix.
        suffix = folder[len(prefix):]
        return (0, int(suffix), '') if suffix.isdecimal() else (1, 0, suffix)

    mols = []
    best_mols = []
    # closing() guarantees both writers are closed even if a record fails.
    with closing(Chem.SDWriter(os.path.join(output, 'All_Designs.sdf'))) as wa, \
            closing(Chem.SDWriter(os.path.join(output, 'Best_Designs.sdf'))) as wb:
        for fd in sorted(folders, key=sort_key):
            # Slice the prefix off: str.strip("cycle_") removes a *set* of
            # characters from both ends and can eat part of the suffix.
            cycle = fd[len(prefix):]
            sd = os.path.join(directory, fd, 'ranked_designs.sd')
            if not os.path.exists(sd):
                continue
            best_found = False
            for m in Chem.SDMolSupplier(sd):
                if m is None:
                    # Record RDKit could not parse; nothing to tag or write.
                    continue
                m.SetProp('Cycle', cycle)
                m.SetProp('MolWeight', str(MolWt(m)))
                m.SetProp('LogP', str(LogP(m)))
                m.SetProp('QED', str(QED(m)))
                m.SetProp('SAS', str(SAS(m)))
                mols.append(m)
                wa.write(m)
                if not best_found:
                    # Select the highest score design in the cycle
                    best_mols.append(m)
                    wb.write(m)
                    best_found = True
            # Periodic flush; cycles with a non-numeric suffix are simply
            # not used as flush points.
            if cycle.isdecimal() and int(cycle) % 5000 == 0:
                wa.flush()
                wb.flush()

    print(len(mols), "total molecules combined from", len(folders),"cycles in\n", directory)
    print(len(best_mols), "selected")
    sys.stdout.flush()
    return mols, best_mols
| 76 | + |
# Command-line entry point: combine the per-cycle SD files, then build the
# CSV score tables, both written into the output directory.
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(description="combine and the ranked_designs.sd in each "+
        "'cycle_*' folder from Sample and Dock and calculate MolWeight, SAS, LogP, and QED.")
    # Required: without it a.input is None and os.path.abspath(None) fails
    # with a TypeError instead of a usage message.
    parser.add_argument("-i", "--input", required=True,
                        help="input directory that contain folder by cycles")
    parser.add_argument("-o", "--outpath", help="output directory for the combined sdf file",
                        default='./')
    a = parser.parse_args()
    directory = os.path.abspath(a.input)
    out = os.path.abspath(a.outpath)
    if not os.path.exists(out):
        # exist_ok covers the directory appearing between the check and the call.
        os.makedirs(out, exist_ok=True)
        print(out, "Made")
    combine_designs(directory, out)
    mkdf(directory, out)
0 commit comments