Merge pull request #40 from Waztom/XCOS2

tdudgeon · web-flow · commit dbb5608be0bc · 2020-07-24T14:13:42.000+01:00
Addded MCs filter to XCOS and removed Scores 2-3
diff --git a/src/python/pipelines/xchem/xcos.py b/src/python/pipelines/xchem/xcos.py
@@ -25,12 +25,12 @@
 from rdkit.Chem.FeatMaps import FeatMaps
 from rdkit.Chem import AllChem, rdShapeHelpers
 from rdkit import RDConfig
+from rdkit.Chem import rdFMCS
 
 import os, argparse
 
 import numpy as np
 import pandas as pd
-from sklearn.neighbors import NearestNeighbors
 
 from datetime import datetime
 
@@ -40,8 +40,6 @@
 field_XCosRefMols = "XCos_RefMols"
 field_XCosNumHits = "XCos_NumHits"
 field_XCosScore1 = "XCos_Score1"
-field_XCosScore2 = "XCos_Score2"
-field_XCosScore3 = "XCos_Score3"
 
 
 date = datetime.today().strftime('%Y-%m-%d')
@@ -101,168 +99,15 @@ def getFeatureMapScore(small_m, large_m, score_mode=FeatMaps.FeatMapScoreMode.Al
     except ZeroDivisionError:
         return 0
 
-def getNumberfeats(mol):
-
-    featLists = []
-    rawFeats = fdef.GetFeaturesForMol(mol)
-    # filter that list down to only include the ones we're intereted in
-    featLists.append([f for f in rawFeats if f.GetFamily() in keep])
-
-    return len(featLists)
-
-
-def getFeatureMapXCOS(mol_list):
-    allFeats = []
-    for m in mol_list:
-
-        rawFeats = fdef.GetFeaturesForMol(m)
-        featDeats = [(f.GetType(),
-                      f.GetPos().x,
-                      f.GetPos().y,
-                      f.GetPos().z) for f in rawFeats if f.GetFamily() in keep]
-
-        allFeats.append(featDeats)
-
-
-    feature_map_df = pd.DataFrame([t for lst in allFeats for t in lst],
-                                  columns =['featType', 'x', 'y', 'z'])
-
-    return feature_map_df
-
-
-def getFeatureAgg(feature_map_df, rad_thresh):
-
-    # Group data into unique feature types
-    grouped_df = feature_map_df.groupby('featType')
-
-    data_to_add = []
-
-    for group_name, df_group in grouped_df:
-
-        # Reset index df
-        df_group = df_group.reset_index()
-
-        if len(df_group) == 1:
-
-            data_to_add.append(df_group)
-
-        if len(df_group) > 1:
-
-            # Get feature name
-            feat_name = df_group.featType.unique()[0]
-
-            # Use radius neighbours to find features within
-            # spere with radius thresh
-            neigh = NearestNeighbors(radius=rad_thresh)
-
-            while len(df_group) > 0:
-
-                neigh.fit(df_group[['x','y','z']])
-
-                # Get distances and indices of neigbours within radius threshold
-                rng = neigh.radius_neighbors()
-                neigh_dist = rng[0][0]
-                neigh_indices = rng[1][0]
-
-                # Append the first index - NB clustering done relative to index 0
-                neigh_indices = list(np.append(0, neigh_indices))
-
-                # Calculate average x,y,z coords for features in similar loc
-                x_avg = np.mean(df_group.iloc[neigh_indices].x)
-                y_avg = np.mean(df_group.iloc[neigh_indices].y)
-                z_avg = np.mean(df_group.iloc[neigh_indices].z)
-
-                # Add feature with average x, y and z values
-                new_row = [(feat_name, x_avg, y_avg, z_avg)]
-
-                cluster_df = pd.DataFrame(data=new_row, columns = ['featType', 'x', 'y', 'z'])
-
-                data_to_add.append(cluster_df)
-
-                # Remove indices of clustered neigbours
-                df_group = df_group.drop(df_group.index[neigh_indices])
-
-    # Create single DF from list of dfs
-    clustered_df = pd.concat(data_to_add)
-
-    return clustered_df
-
 
 # This is the main XCOS function
-def getReverseScores(clustered_df, mols, frags, no_clustered_feats, rad_threshold, COS_threshold, writer):
+def getReverseScores(mols, frags, COS_threshold, writer):
 
     for mol in mols:
 
         # Get the bits
         compound_bits = getBits(mol)
 
-        # We are going to include a feature mapping score, where the
-        # number of features of the compound matching the clustered feats
-        # within a threshold are found
-
-        # Get feature map of compound bits as df
-        feature_map_bits = getFeatureMapXCOS(compound_bits)
-
-        # Group data into unique feature types
-        grouped_df = feature_map_bits.groupby('featType')
-
-        no_feats_matched = []
-        dist_feats_matched = []
-
-        # Use radius neighbours to find features within
-        # sphere with radius thresh
-        neigh = NearestNeighbors(radius=rad_threshold)
-
-        # Loop through grouped features
-        for group_name, df_group in grouped_df:
-
-            # Get feat name
-            feat_name = df_group.featType.unique()[0]
-
-            # Get similar feats from cluster df
-            cluster_test = clustered_df[clustered_df.featType == feat_name]
-
-            # Reset index df
-            df_group = df_group.reset_index()
-
-            if len(cluster_test) == 1:
-
-                # Calculate distances
-                x1_sub_x2 = (cluster_test.iloc[0].x - df_group.iloc[0].x) ** 2
-                y1_sub_y2 = (cluster_test.iloc[0].y - df_group.iloc[0].y) ** 2
-                z1_sub_z2 = (cluster_test.iloc[0].z - df_group.iloc[0].z) ** 2
-
-                diff_sum = x1_sub_x2 + y1_sub_y2 + z1_sub_z2
-
-                dist = diff_sum ** 0.5
-
-                if dist < rad_threshold:
-                    # Let's get the number of feats matched
-                    no_feats_matched.append(1)
-
-                    # Let's get the distance of the feats matched
-                    dist_feats_matched.append([dist])
-
-            if len(cluster_test) > 1:
-                neigh.fit(cluster_test[['x', 'y', 'z']])
-
-                while len(df_group) > 0:
-                    # Get distances and indices of neigbours within radius threshold
-                    feat_coords = [[df_group.iloc[0].x, df_group.iloc[0].y, df_group.iloc[0].z]]
-                    rng = neigh.radius_neighbors(feat_coords)
-
-                    neigh_dist = rng[0][0]
-                    neigh_indices = rng[1][0]
-
-                    # Let's get the number of feats matched
-                    no_feats_matched.append(len(neigh_indices))
-
-                    # Remove index 0 of df_group
-                    df_group = df_group.drop(df_group.index[0])
-
-        # Get total number of feat matches
-        no_feats = np.sum(no_feats_matched)
-
         all_scores = []
 
         for bit in compound_bits:
@@ -271,53 +116,62 @@ def getReverseScores(clustered_df, mols, frags, no_clustered_feats, rad_threshol
             no_bit_atoms = bit.GetNumAtoms()
 
             scores = []
+
             for frag_mol in frags:
+
                 # NB reverse SuCOS scoring
                 fm_score = getFeatureMapScore(bit, frag_mol)
                 fm_score = np.clip(fm_score, 0, 1)
                 # Change van der Waals radius scale for stricter overlay
                 protrude_dist = rdShapeHelpers.ShapeProtrudeDist(bit, frag_mol, allowReordering=False, vdwScale=0.2)
                 protrude_dist = np.clip(protrude_dist, 0, 1)
 
-                reverse_SuCOS_score = 0.5 * fm_score + 0.5 * (1 - protrude_dist)
+                # Get frag name for linking to score
+                frag_name = frag_mol.GetProp('_Name').strip('Mpro-')
+                
+                # Check if MCS yield > 0 atoms
+                mcs_match = rdFMCS.FindMCS([bit,frag_mol],ringMatchesRingOnly=True,matchValences=True)
+                
+                # Get number of atoms in MCS match found
+                no_mcs_atoms = Chem.MolFromSmarts(mcs_match.smartsString).GetNumAtoms()
 
-                # Get number of feats from bit for scaling score
-                no_bit_feats = getNumberfeats(bit)
+                if no_mcs_atoms == 0:
 
-                # Get some info and append to list
-                frag_name = frag_mol.GetProp('_Name').strip('Mpro-')
+                    scores.append((frag_name, 0, no_bit_atoms))
+                
+                if no_mcs_atoms > 0:
+
+                    # NB reverse SuCOS scoring
+                    fm_score = getFeatureMapScore(bit, frag_mol)
+                    fm_score = np.clip(fm_score, 0, 1)
+
+                    # Change van der Waals radius scale for stricter overlay                     
+                    protrude_dist = rdShapeHelpers.ShapeProtrudeDist(bit, frag_mol,
+                                                                     allowReordering=False,
+                                                                     vdwScale=0.2)
+                    protrude_dist = np.clip(protrude_dist, 0, 1)
+
+                    reverse_SuCOS_score = 0.5 * fm_score + 0.5 * (1 - protrude_dist)
 
-                scores.append((frag_name, reverse_SuCOS_score, no_bit_atoms, no_bit_feats))
+                    scores.append((frag_name, reverse_SuCOS_score, no_bit_atoms))
 
             all_scores.append(scores)
 
             list_dfs = []
+
             for score in all_scores:
-                df = pd.DataFrame(data=score, columns=['Fragment', 'Score', 'No_bit_atoms', 'No_bit_feats'])
+
+                df = pd.DataFrame(data=score, columns=['Fragment', 'Score', 'No_bit_atoms'])
+                
                 # Get maximum scoring fragment for bit match
                 df = df[df['Score'] == df['Score'].max()]
                 list_dfs.append(df)
 
             final_df = pd.concat(list_dfs)
 
-            # Get total bit score and some denominator terms
-            bits_score = (final_df.No_bit_atoms * final_df.Score).sum()
-            total_atoms = final_df.No_bit_atoms.sum()
-            feat_match_fraction = no_feats / no_clustered_feats
-
             # Score 1: the score is scaled by the number of bit atoms
-            score_1 = bits_score
-
-            # Score 2: the score is scaled by the number of bit atoms
-            # penalised by the fraction of feats matched
-            # the to total number feats clustered
-            score_2 = score_1 * feat_match_fraction
-
-            # Score 3: the score is determined by the fraction of matching
-            # features to the clustered features within a threshold. This
-            # should yield similar values to Tim's Featurestein method?
-            score_3 = feat_match_fraction
-
+            score_1 = (final_df.No_bit_atoms * final_df.Score).sum()
+           
             # Let's only get frags above a threshold
             final_df = final_df[final_df.Score > COS_threshold]
 
@@ -331,8 +185,6 @@ def getReverseScores(clustered_df, mols, frags, no_clustered_feats, rad_threshol
         mol.SetProp(field_XCosRefMols, ','.join(all_frags))
         mol.SetIntProp(field_XCosNumHits, len(all_frags))
         mol.SetProp(field_XCosScore1, "{:.4f}".format(score_1))
-        mol.SetProp(field_XCosScore2, "{:.4f}".format(score_2))
-        mol.SetProp(field_XCosScore3, "{:.4f}".format(score_3))
 
         # Write to file
         writer.write(mol)
@@ -352,19 +204,8 @@ def process(molecules, fragments, writer):
     else:
         utils.log('Using', len(frag_mol_list), 'fragments. No errors')
 
-    feature_map_df =  getFeatureMapXCOS(frag_mol_list)
-    utils.log('Feature map dataframe shape:', feature_map_df.shape)
-
-    # Set radius threshold
-    rad_thresh = 1.5
-
-    # Aggregate features using nearest neigbours algo
-    clustered_df = getFeatureAgg(feature_map_df, rad_thresh=rad_thresh)
-    utils.log('Clustered dataframe shape:', clustered_df.shape)
-    no_clustered_feats = len(clustered_df)
-
-    #clustered_df, mols, rad_threshold, COS_threshold, writer
-    getReverseScores(clustered_df, molecules, frag_mol_list, no_clustered_feats, 1.0, 0.50, writer)
+    #mols, frags, COS_threshold, writer
+    getReverseScores(molecules, frag_mol_list, 0.40, writer)
 
 
 def main():
@@ -381,7 +222,6 @@ def main():
     parser.add_argument('--no-gzip', action='store_true', help='Do not compress the output (STDOUT is never compressed')
     parser.add_argument('--metrics', action='store_true', help='Write metrics')
 
-
     args = parser.parse_args()
     utils.log("XCos Args: ", args)
 
@@ -394,15 +234,11 @@ def main():
     clsMappings[field_XCosRefMols] = "java.lang.String"
     clsMappings[field_XCosNumHits] = "java.lang.Integer"
     clsMappings[field_XCosScore1] = "java.lang.Float"
-    clsMappings[field_XCosScore2] = "java.lang.Float"
-    clsMappings[field_XCosScore3] = "java.lang.Float"
+
     fieldMetaProps.append({"fieldName":field_XCosRefMols,   "values": {"source":source, "description":"XCos reference fragments"}})
     fieldMetaProps.append({"fieldName":field_XCosNumHits,   "values": {"source":source, "description":"XCos number of hits"}})
     fieldMetaProps.append({"fieldName":field_XCosScore1,   "values": {"source":source, "description":"XCos score 1"}})
-    fieldMetaProps.append({"fieldName":field_XCosScore2,   "values": {"source":source, "description":"XCos score 2"}})
-    fieldMetaProps.append({"fieldName":field_XCosScore3,   "values": {"source":source, "description":"XCos score 3"}})
-
-
+    
     frags_input,frags_suppl = rdkit_utils.default_open_input(args.fragments, args.fragments_format)
 
     inputs_file, inputs_supplr = rdkit_utils.default_open_input(args.input, args.informat)