Merge branch 'master' of github.com:InformaticsMatters/pipelines

tdudgeon · tdudgeon · commit 63ac14d05446 · 2020-07-29T12:01:25.000+01:00
diff --git a/.travis.yml b/.travis.yml
@@ -8,13 +8,13 @@
 #
 # If you set PUBLISH_IMAGES you must also set the following: -
 #
-# DOCKER_USERNAME       If PUBLISH_IMAGES is 'yes'
-# DOCKER_PASSWORD       If PUBLISH_IMAGES is 'yes'
+# DOCKER_USERNAME
+# DOCKER_PASSWORD
 #
 # -----------------
 #
 # NOTE: Pull requests from foreign repositories will not
-#       result in encrupted variables being set.
+#       result in encrypted variables being set.
 #       So, regardless of the state of PUBLISH_IMAGES,
 #       images will only be published if DOCKER_PASSWORD is defined.
 
@@ -23,39 +23,26 @@ services:
 - docker
 
 stages:
-- name: test
 - name: publish latest
   if: |
     branch = master \
-    AND env(PUBLISH_IMAGES) = yes \
-    AND env(DOCKER_PASSWORD) IS present
+    AND env(PUBLISH_IMAGES) = yes
 - name: publish tag
   if: |
     tag IS present \
-    AND env(PUBLISH_IMAGES) = yes \
-    AND env(DOCKER_PASSWORD) IS present
+    AND env(PUBLISH_IMAGES) = yes
 - name: publish stable
   if: |
     tag IS present \
     AND tag =~ ^([0-9]+\.){1,2}[0-9]+$ \
-    AND env(PUBLISH_IMAGES) = yes \
-    AND env(DOCKER_PASSWORD) IS present
+    AND env(PUBLISH_IMAGES) = yes
 
 jobs:
   include:
 
-  - stage: test
-    name: Test Local Image
-    script:
-    - docker build -t informaticsmatters/rdkit_pipelines:latest -f Dockerfile-rdkit .
-    - docker build -t squonk/rdkit-pipelines-sdposter:latest -f Dockerfile-sdposter .
-    - git clone https://github.com/InformaticsMatters/pipelines-utils.git
-    - cd pipelines-utils/src/groovy
-    - groovy PipelineTester.groovy -indocker
-
   # Publish-stage jobs...
-  # Every successful master build results in a latest image
-  # and every tag results in a tagged image in Docker Hub.
+  # Every successful master build results in a build (and test)
+  # of the latest image and every tag results in a tagged image in Docker Hub.
   # Tags that match a RegEx are considered 'official' tags
   # and also result in a 'stable' image tag.
 
@@ -65,29 +52,38 @@ jobs:
     # Build and push the pipelines-rdkit image and its sd-poster
     - docker build -t informaticsmatters/rdkit_pipelines:latest -f Dockerfile-rdkit .
     - docker build -t squonk/rdkit-pipelines-sdposter:latest -f Dockerfile-sdposter .
-    - docker login -u="$DOCKER_USERNAME" -p="$DOCKER_PASSWORD"
-    - docker push informaticsmatters/rdkit_pipelines:latest
-    - docker push squonk/rdkit-pipelines-sdposter:latest
+    - git clone https://github.com/InformaticsMatters/pipelines-utils.git
+    - cd pipelines-utils/src/groovy
+    - groovy PipelineTester.groovy -indocker
+    - if [ -n "$DOCKER_PASSWORD" ]; then
+        docker login -u="$DOCKER_USERNAME" -p="$DOCKER_PASSWORD";
+        docker push informaticsmatters/rdkit_pipelines:latest;
+        docker push squonk/rdkit-pipelines-sdposter:latest;
+      fi
 
   - stage: publish tag
     name: Publish Tagged Image
     script:
     # Build and push the pipelines-rdkit image and its sd-poster
     - docker build -t informaticsmatters/rdkit_pipelines:${TRAVIS_TAG} -f Dockerfile-rdkit .
     - docker build -t squonk/rdkit-pipelines-sdposter:${TRAVIS_TAG} -f Dockerfile-sdposter .
-    - docker login -u="$DOCKER_USERNAME" -p="$DOCKER_PASSWORD"
-    - docker push informaticsmatters/rdkit_pipelines:${TRAVIS_TAG}
-    - docker push squonk/rdkit-pipelines-sdposter:${TRAVIS_TAG}
+    - if [ -n "$DOCKER_PASSWORD" ]; then
+        docker login -u="$DOCKER_USERNAME" -p="$DOCKER_PASSWORD";
+        docker push informaticsmatters/rdkit_pipelines:${TRAVIS_TAG};
+        docker push squonk/rdkit-pipelines-sdposter:${TRAVIS_TAG};
+      fi
 
   - stage: publish stable
     name: Publish Stable Image
     script:
     # Pull the corresponding pipelines-rdkit image tag
     # and push it again as 'stable'
-    - docker login -u="$DOCKER_USERNAME" -p="$DOCKER_PASSWORD"
-    - docker pull informaticsmatters/rdkit_pipelines:${TRAVIS_TAG}
-    - docker pull squonk/rdkit-pipelines-sdposter:${TRAVIS_TAG}
-    - docker tag informaticsmatters/rdkit_pipelines:${TRAVIS_TAG} informaticsmatters/rdkit_pipelines:stable
-    - docker tag squonk/rdkit-pipelines-sdposter:${TRAVIS_TAG} squonk/rdkit-pipelines-sdposter:stable
-    - docker push informaticsmatters/rdkit_pipelines:stable
-    - docker push squonk/rdkit-pipelines-sdposter:stable
+    - if [ -n "$DOCKER_PASSWORD" ]; then
+        docker login -u="$DOCKER_USERNAME" -p="$DOCKER_PASSWORD";
+        docker pull informaticsmatters/rdkit_pipelines:${TRAVIS_TAG};
+        docker pull squonk/rdkit-pipelines-sdposter:${TRAVIS_TAG};
+        docker tag informaticsmatters/rdkit_pipelines:${TRAVIS_TAG} informaticsmatters/rdkit_pipelines:stable;
+        docker tag squonk/rdkit-pipelines-sdposter:${TRAVIS_TAG} squonk/rdkit-pipelines-sdposter:stable;
+        docker push informaticsmatters/rdkit_pipelines:stable;
+        docker push squonk/rdkit-pipelines-sdposter:stable;
+      fi
diff --git a/README.md b/README.md
@@ -3,7 +3,7 @@
 [![Build Status](https://travis-ci.com/InformaticsMatters/pipelines.svg?branch=master)](https://travis-ci.com/InformaticsMatters/pipelines)
 ![GitHub release (latest SemVer including pre-releases)](https://img.shields.io/github/v/release/informaticsmatters/pipelines?include_prereleases)
 
-The project experiments with ways to generate data processing piplelines.
+The project experiments with ways to generate data processing pipelines.
 The aim is to generate some re-usable building blocks that can be piped
 together into more functional pipelines. Their prime initial use is as executors
 for the Squonk Computational Notebook (http://squonk.it) though it is expected
@@ -22,7 +22,7 @@ be coming soon, including some from the Java ecosystem.
 * See [here](src/nextflow/README.md) for more info on running these in Nextflow.
 
 Note: this is experimental, subject to change, and there are no guarantees that things work as expected!
-That said, its already proved to be highly useful in the Squonk Computational Notebook, and if you are interested let us know, and join the fun.
+That said, it's already proved to be highly useful in the Squonk Computational Notebook, and if you are interested let us know, and join the fun.
 
 The code is licensed under the Apache 2.0 license.
 
diff --git a/src/python/pipelines/xchem/xcos.py b/src/python/pipelines/xchem/xcos.py
@@ -101,85 +101,111 @@ def getFeatureMapScore(small_m, large_m, score_mode=FeatMaps.FeatMapScoreMode.Al
 
 
 # This is the main XCOS function
-def getReverseScores(mols, frags, COS_threshold, writer):
+def getReverseScores(mols, frags, score_threshold, writer):
 
     for mol in mols:
-
+        
         # Get the bits
         compound_bits = getBits(mol)
 
         all_scores = []
 
         for bit in compound_bits:
+            
+            # Let's remove wildcard atoms
+            # Removing wildcard atoms does not impact feat score but does lower shape overlay
+            # For scoring should multiply feat score by number of non-wildcard atoms and use
+            # all atoms including wildcard for shape overlay
+            bit_without_wildcard_atoms = Chem.DeleteSubstructs(bit, Chem.MolFromSmarts('[#0]'))
+
+            # Let's only score bits that have more than one atom (do not count wildcard atoms)           
+            # Get number of bit atoms without wildcard atoms
+            no_bit_atoms_without_wild_card = bit_without_wildcard_atoms.GetNumAtoms()
 
             # Get number of bit atoms
             no_bit_atoms = bit.GetNumAtoms()
 
-            scores = []
-
-            for frag_mol in frags:
-
-                # NB reverse SuCOS scoring
-                fm_score = getFeatureMapScore(bit, frag_mol)
-                fm_score = np.clip(fm_score, 0, 1)
-                # Change van der Waals radius scale for stricter overlay
-                protrude_dist = rdShapeHelpers.ShapeProtrudeDist(bit, frag_mol, allowReordering=False, vdwScale=0.2)
-                protrude_dist = np.clip(protrude_dist, 0, 1)
-
-                # Get frag name for linking to score
-                frag_name = frag_mol.GetProp('_Name').strip('Mpro-')
-                
-                # Check if MCS yield > 0 atoms
-                mcs_match = rdFMCS.FindMCS([bit,frag_mol],ringMatchesRingOnly=True,matchValences=True)
-                
-                # Get number of atoms in MCS match found
-                no_mcs_atoms = Chem.MolFromSmarts(mcs_match.smartsString).GetNumAtoms()
-
-                if no_mcs_atoms == 0:
-
-                    scores.append((frag_name, 0, no_bit_atoms))
-                
-                if no_mcs_atoms > 0:
-
-                    # NB reverse SuCOS scoring
-                    fm_score = getFeatureMapScore(bit, frag_mol)
-                    fm_score = np.clip(fm_score, 0, 1)
-
-                    # Change van der Waals radius scale for stricter overlay                     
-                    protrude_dist = rdShapeHelpers.ShapeProtrudeDist(bit, frag_mol,
-                                                                     allowReordering=False,
-                                                                     vdwScale=0.2)
-                    protrude_dist = np.clip(protrude_dist, 0, 1)
-
-                    reverse_SuCOS_score = 0.5 * fm_score + 0.5 * (1 - protrude_dist)
-
-                    scores.append((frag_name, reverse_SuCOS_score, no_bit_atoms))
-
-            all_scores.append(scores)
-
-            list_dfs = []
-
-            for score in all_scores:
-
-                df = pd.DataFrame(data=score, columns=['Fragment', 'Score', 'No_bit_atoms'])
+            # Only score if enough info in bit to describe a vector - this will bias against 
+            # cases where frag has long aliphatic chain
+            
+            if no_bit_atoms_without_wild_card > 1:
                 
-                # Get maximum scoring fragment for bit match
-                df = df[df['Score'] == df['Score'].max()]
-                list_dfs.append(df)
-
-            final_df = pd.concat(list_dfs)
-
-            # Score 1: the score is scaled by the number of bit atoms
-            score_1 = (final_df.No_bit_atoms * final_df.Score).sum()
-           
-            # Let's only get frags above a threshold
-            final_df = final_df[final_df.Score > COS_threshold]
-
-            # Let#s sort the df by increasing score
-            final_df = final_df.sort_values(by=['Score'], ascending=False)
-
-            # Get the unique fragments above threshold
-            all_frags = pd.unique(final_df.Fragment)
+                scores = []
+
+                for frag_mol in frags:
+                    
+                    # Get frag name for linking to score
+                    frag_name = frag_mol.GetProp('_Name').strip('Mpro-')
+
+                    # Score only if some common structure shared between bit and fragment.
+                    # Check if MCS yield > 0 atoms
+                    mcs_match = rdFMCS.FindMCS([bit,frag_mol], ringMatchesRingOnly=True, matchValences=True)
+                    
+                    # Get mcs_mol from mcs_match
+                    mcs_mol = Chem.MolFromSmarts(mcs_match.smartsString)
+                    
+                    # check if frag has MCS mol
+                    mcs_test = frag_mol.HasSubstructMatch(mcs_mol)
+
+                    if mcs_test:
+                        
+                        # Change van der Waals radius scale for stricter overlay
+                        protrude_dist = rdShapeHelpers.ShapeProtrudeDist(bit, frag_mol, allowReordering=False, vdwScale=0.2)
+                        protrude_dist = np.clip(protrude_dist, 0, 1)
+                        
+                        protrude_score = 1 - protrude_dist
+
+                        # We are comparing small bits relative to large frags
+                        # If overlay poor then assign score of 0
+                        # NB reverse SuCOS scoring. The feature map score is also computationally
+                        # more expensive, so it is only computed when the shape overlay passes
+
+                        if protrude_score > score_threshold:
+                            
+                            fm_score = getFeatureMapScore(bit, frag_mol)
+                            fm_score = np.clip(fm_score, 0, 1)
+                                    
+                            # What about good shape overlay but poor feat match?
+                            # Let's add a cutoff here to prevent good overlays with
+                            # poor feat match - eg. 3 mem ring 2 x C atoms overlay well
+                            # with 2 x aromatic ring Cs
+                            
+                            if fm_score > score_threshold:
+                                # Use modified SuCOS score where feat_score scaled by number of bit atoms 
+                                # without wildcard atoms and the shape overlay score by the number of bit atoms
+                                # including wildcard atoms
+                                scores.append((frag_name, protrude_score,no_bit_atoms,fm_score,no_bit_atoms_without_wild_card))
+                            else:
+                                scores.append((frag_name,0,no_bit_atoms,0,no_bit_atoms_without_wild_card ))
+                        else:
+                            scores.append((frag_name,0,no_bit_atoms,0,no_bit_atoms_without_wild_card ))
+                    else:
+                        scores.append((frag_name,0,no_bit_atoms,0,no_bit_atoms_without_wild_card ))                  
+                    
+                all_scores.append(scores)
+
+                list_dfs = []
+
+                for score in all_scores:
+
+                    df = pd.DataFrame(data=score, columns = ['Fragment','Shape_score','no_bit_atoms','Feat_score','no_bit_atoms_without_wild_card'])
+                    
+                    # Get maximum scoring fragment for bit match
+                    df['Modified_SuCOS_score'] = 0.5 * (df.Feat_score * df.no_bit_atoms_without_wild_card) + 0.5 * (df.Shape_score * df.no_bit_atoms)
+                    df = df[df['Modified_SuCOS_score'] == df['Modified_SuCOS_score'].max()]
+                    list_dfs.append(df)
+
+                final_df = pd.concat(list_dfs)
+
+        # Score 1: the score is scaled by the number of bit atoms
+        score_1 = final_df.Modified_SuCOS_score.sum()
+
+        # Let's only get frags with a score > 0 
+        #final_df['SuCOS_score'] = 0.5 * final_df.Feat_score + 0.5 * final_df.Shape_score
+        final_df = final_df[final_df.Modified_SuCOS_score > 0]
+        
+        # Get the unique fragments with a score > 0
+        all_frags = pd.unique(final_df.Fragment)
 
         # Add props we want
         mol.SetProp(field_XCosRefMols, ','.join(all_frags))
@@ -204,8 +230,8 @@ def process(molecules, fragments, writer):
     else:
         utils.log('Using', len(frag_mol_list), 'fragments. No errors')
 
-    #mols, frags, COS_threshold, writer
-    getReverseScores(molecules, frag_mol_list, 0.40, writer)
+    #mols, frags, score_threshold, writer
+    getReverseScores(molecules, frag_mol_list, 0.5, writer)
 
 
 def main():