Commit f274d1b

implemented ensemble and minor bug fix

1 parent d73b101

File tree

5 files changed (+67, -28 lines changed)


.gitmodules

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+[submodule "faerun-python"]
+    path = faerun-python
+    url = git@github.com:Truman-Xu/faerun-python.git

faerun-python

Submodule faerun-python added at 3f95a69
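Because faerun-python is now tracked as a git submodule pinned at 3f95a69, an existing checkout will not contain it automatically: running git submodule update --init inside the repository (or cloning fresh with git clone --recurse-submodules) pulls the pinned revision.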

sampledock/SnD/tmap_plotter.py

Lines changed: 5 additions & 5 deletions

@@ -62,8 +62,8 @@ def df_to_faerun(df,x,y,s,t):
     f = Faerun(view="front", coords=False)
     f.add_scatter(
         # No space in the string allowed for the name, use underscore!!
-        # Cannot start with a number, it has to be a letter!! Weird Bug!!
-        # My guess is that the string is to be converted to a variable name,
+        # Cannot start with a number, it has to be a letter!!
+        # the string is to be converted to a variable name,
         # therefore it has to be compatible with python variable naming scheme
         "SampleDock",
         {

@@ -94,8 +94,6 @@ def df_to_faerun(df,x,y,s,t):
     )
     # The first character of the name has to be a letter!
     f.add_tree("SnD_Tree", {"from": s, "to": t}, point_helper="SampleDock")
-    f.plot("SampleDock"+'_space', # name of the .html file
-           template="smiles")
     print('Plotting finished')
     return f

@@ -126,4 +124,6 @@ def df_to_faerun(df,x,y,s,t):
     df['s'] = s
     df['t'] = t
     df.to_csv(os.path.join(outpath,"props.csv"),index = False)
-    df_to_faerun(df)
+    f = df_to_faerun(df)
+    f.plot(os.path.join(outpath,"SampleDock"+'_space'), # name of the .html file
+           template="smiles")
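With this change, df_to_faerun only builds and returns the Faerun object; the plot call moves to the caller, and the block at the bottom of the file now captures the return value and writes the HTML under outpath. A minimal usage sketch of the refactored interface; the import path is inferred from the file location, and outpath plus the df, x, y, s, t inputs are placeholders for whatever the tmap layout step already produced:

import os
from sampledock.SnD.tmap_plotter import df_to_faerun  # import path assumed from the file location

outpath = "post_proc"                              # hypothetical output directory
f = df_to_faerun(df, x, y, s, t)                   # build the Faerun scatter + tree; nothing is written yet
f.plot(os.path.join(outpath, "SampleDock_space"),  # caller now controls the .html name and location
       template="smiles")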

sampledock/__main__.py

Lines changed: 31 additions & 22 deletions

@@ -13,7 +13,7 @@
 import sys
 import subprocess
 import pickle
-from rdkit import rdBase
+from rdkit import rdBase, Chem
 ## Disable rdkit Logs
 rdBase.DisableLog('rdApp.error')
 from .jtvae import Vocab, JTNNVAE

@@ -74,27 +74,35 @@
         dock(ligs, docking_dir, prmfile, p.docking_prm, p.npose, p.prefix)
         ranked_poses = sort_pose(docking_dir, p.sort_by, p.prefix)
         save_pose(ranked_poses, design_dir)
-
+
         ## Generate new design list
-        for energy, name, mol in ranked_poses:
-            smi = mol.GetProp('SMILES')
-            design_list = []
-            try:
-                print('[INFO]: Generating new designs \t', end = '\r')
-                sys.stdout.flush()
-                design_list = jtvae.smiles_gen(smi, p.ndesign)
-            # go to the second best candidate if the best does not give any return
-            except KeyError as key:
-                print('[KeyError]',key,'is not part of the vocabulary')
-                continue
-
-            if len(design_list) != 0:
-                break
-
-            else:
-                print('Current design (%s) has no offspring; trying the next one \r'%name)
-
-        print("[INFO]: Cycle %s: %s %s kcal/mol"%(j, smi, energy)+'\t'*6)
+        if p.ensemble > 1:
+            top_smi_list = [Chem.MolToSmiles(mol) for _, _, mol in ranked_poses[:p.ensemble]]
+            smi = jtvae.find_ensemble(top_smi_list)
+            design_list = jtvae.smiles_gen(smi, p.ndesign)
+            best_score = ranked_poses[0][0]
+            print("[INFO]: Cycle %s: %s Best Score: %s kcal/mol"%(j, smi, best_score)+'\t'*6)
+        else:
+            for energy, name, mol in ranked_poses:
+                smi = mol.GetProp('SMILES')
+                try:
+                    print('[INFO]: Generating new designs \t', end = '\r')
+                    sys.stdout.flush()
+                    # get new design list for the nex cycle
+                    design_list = jtvae.smiles_gen(smi, p.ndesign)
+                # This is due to difference in parsing of SMILES (especially rings)
+                ## TODO: Convert sampledock to OOP structure and use the vectors directly
+                except KeyError as key:
+                    print('[KeyError]',key,'is not part of the vocabulary (the model was not trained with this scaffold)')
+                    continue
+                # if there are offspring designs, break the loop
+                if len(design_list) != 0:
+                    break
+                # go to the next candidate if the current one does not give any return
+                else:
+                    print('Current design (%s) has no offspring; trying the next one \r'%name)
+
+            print("[INFO]: Cycle %s: %s %s kcal/mol"%(j, smi, energy)+'\t'*6)
 
     print("\n", p.ncycle, "cycles of design finished. Starting post-processing.")
     # Create post-process working directory

@@ -117,6 +125,7 @@
         pickle.dump((x,y,s,t),f)
     # Create tmap on faerun
     f = df_to_faerun(allscores,x,y,s,t)
-
+    f.plot("SampleDock"+'_space', path = postproc_wd, # name and path of the .html file
+           template="smiles")
     with open(os.path.join(postproc_wd,'SampleDock.faerun'), 'wb') as handle:
         pickle.dump(f.create_python_data(), handle, protocol=pickle.HIGHEST_PROTOCOL)
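In short, the seeding step of each cycle now has two paths: when p.ensemble > 1 (presumably a new integer setting in the SampleDock parameter file), the top-N docked poses are collapsed into one consensus SMILES via find_ensemble; otherwise the previous best-pose-first fallback loop is kept. A condensed sketch of that selection logic, with jtvae, ranked_poses and the parameter values treated as already available; the helper name is hypothetical and only mirrors the control flow shown above:

from rdkit import Chem

def pick_seed_designs(jtvae, ranked_poses, ensemble, ndesign):
    """Sketch of the new seeding logic, not the shipped function."""
    if ensemble > 1:
        # consensus seed: canonical SMILES of the top-N poses, averaged in latent space
        top_smi_list = [Chem.MolToSmiles(mol) for _, _, mol in ranked_poses[:ensemble]]
        seed = jtvae.find_ensemble(top_smi_list)
        return seed, jtvae.smiles_gen(seed, ndesign)
    # fallback: walk down the ranking until a pose encodes and yields offspring
    for energy, name, mol in ranked_poses:
        smi = mol.GetProp('SMILES')
        try:
            designs = jtvae.smiles_gen(smi, ndesign)
        except KeyError:
            continue                      # scaffold not in the JT-VAE vocabulary
        if designs:
            return smi, designs
    return None, []                       # no ranked pose produced offspring designs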

sampledock/jtvae/jtnn_vae.py

Lines changed: 27 additions & 1 deletion

@@ -71,7 +71,7 @@ def smiles_gen(self,smiles,ndesign,prob_decode = False):
         ## Convert smiles to one-hot encoding (altered function from original code)
         x_tree, x_mol = self.encode_single_smiles(smiles)
 
-        ## Encode one-hots to z-mean and log var. Following Mueller et al.
+        ## Encode one-hots to z-mean and log variance. Following Mueller et al.
         tree_mean = self.T_mean(x_tree)
         tree_log_var = -torch.abs(self.T_var(x_tree))
         mol_mean = self.G_mean(x_mol)

@@ -89,6 +89,32 @@ def smiles_gen(self,smiles,ndesign,prob_decode = False):
             smiles_list.append(smilesout)
         return smiles_list
 
+    def find_ensemble(self,smiles_list):
+        z_tree = []
+        z_mol = []
+        for smi in smiles_list:
+            try:
+                x_tree, x_mol = self.encode_single_smiles(smi)
+            # This is due to difference in parsing of SMILES (especially rings)
+            ## TODO: Convert sampledock to OOP structure and use the vectors directly
+            except KeyError as key:
+                print('[KeyError]',key,'is not part of the vocabulary (the model was not trained with this scaffold)')
+                continue
+            tree_mean = self.T_mean(x_tree)
+            tree_log_var = -torch.abs(self.T_var(x_tree))
+            mol_mean = self.G_mean(x_mol)
+            mol_log_var = -torch.abs(self.G_var(x_mol))
+
+            z_tree.append(self.z_vec(tree_mean, tree_log_var))
+            z_mol.append(self.z_vec(mol_mean, mol_log_var))
+
+        z_tree = torch.cat(z_tree)
+        z_mol = torch.cat(z_mol)
+
+        return self.decode(z_tree.mean(0).reshape((1,self.latent_size)),
+                           z_mol.mean(0).reshape((1,self.latent_size)),
+                           False)
+
     def encode_latent(self, jtenc_holder, mpn_holder):
         tree_vecs, _ = self.jtnn(*jtenc_holder)
         mol_vecs = self.mpn(*mpn_holder)
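find_ensemble encodes every SMILES it can parse, draws a latent vector per molecule with z_vec, averages the tree and graph vectors separately, and decodes the mean (the trailing False presumably being the prob_decode flag) into a single consensus SMILES. A minimal usage sketch; jtvae is assumed to be the same trained JTNNVAE instance that __main__.py builds, and the SMILES strings are placeholders:

# `jtvae` is assumed to be a trained JTNNVAE loaded elsewhere (as in __main__.py)
top_smiles = ["CCOc1ccc(O)cc1", "CCNc1ccc(O)cc1", "CCOc1ccc(N)cc1"]  # placeholder top-ranked poses
seed = jtvae.find_ensemble(top_smiles)   # consensus SMILES decoded from the averaged latent vectors
designs = jtvae.smiles_gen(seed, 10)     # offspring designs for the next sample-and-dock cycle

Because the averaging happens in latent space rather than on the strings themselves, the consensus seed can be a molecule that none of the input SMILES contains verbatim.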
