|
| 1 | +#!/usr/bin/env python |
| 2 | + |
| 3 | +# Copyright 2020 Informatics Matters Ltd. |
| 4 | +# |
| 5 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 6 | +# you may not use this file except in compliance with the License. |
| 7 | +# You may obtain a copy of the License at |
| 8 | +# |
| 9 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | +# |
| 11 | +# Unless required by applicable law or agreed to in writing, software |
| 12 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | +# See the License for the specific language governing permissions and |
| 15 | +# limitations under the License. |
| 16 | + |
| 17 | +""" |
| 18 | +Ligand pose scoring using 'FeatureStein'. |
| 19 | +This module generates a merged feature map from a set of 3D ligands. |
| 20 | +The output is a pickle of the merged feature map that can be read by the featurestein-score.py module to |
| 21 | +generate scores. |
| 22 | +""" |
| 23 | + |
| 24 | +from __future__ import print_function |
| 25 | +import argparse, os, sys, gzip, pickle |
| 26 | + |
| 27 | +from rdkit import Chem, rdBase, RDConfig |
| 28 | +from rdkit.Chem import AllChem, rdShapeHelpers |
| 29 | +from rdkit.Chem.FeatMaps import FeatMaps |
| 30 | +from rdkit.Chem.FeatMaps.FeatMapUtils import CombineFeatMaps |
| 31 | + |
| 32 | +from pipelines_utils import parameter_utils, utils |
| 33 | +from pipelines_utils_rdkit import rdkit_utils |
| 34 | + |
| 35 | + |
| 36 | +### start function definitions ######################################### |
| 37 | + |
# Feature factory built from the 'BaseFeatures.fdef' definitions in RDKit's data directory.
ffact = AllChem.BuildFeatureFactory(os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef'))

# One default-constructed FeatMapParams per feature family known to the factory.
fmParams = {family: FeatMaps.FeatMapParams() for family in ffact.GetFeatureFamilies()}

# Feature families to drop in filterFeature(); empty, so nothing is excluded.
exclude = ()
| 46 | + |
def filterFeature(f):
    """Return ``f`` unchanged, or None when its family is listed in ``exclude``."""
    return None if f.GetFamily() in exclude else f
| 52 | + |
def getRawFeatures(mol):
    """Return a list of the features of ``mol`` for which filterFeature() gives a truthy result."""
    candidates = ffact.GetFeaturesForMol(mol)
    # keep only the features we're interested in
    return [feat for feat in candidates if filterFeature(feat)]
| 58 | + |
def getFeatureMap(mol):
    """Build a FeatMap from the filtered features of ``mol``, giving every feature a weight of 1."""
    mol_feats = getRawFeatures(mol)
    unit_weights = [1] * len(mol_feats)
    return FeatMaps.FeatMap(feats=mol_feats, weights=unit_weights, params=fmParams)
| 62 | + |
def score_featmaps(fm1, fm2):
    """Generate the score for 2 feature maps.

    The features of ``fm2`` are scored against ``fm1`` and the result is
    divided by the number of features in ``fm1``, so swapping the arguments
    can give a different value.

    :param fm1: feature map used for scoring and for normalisation
    :param fm2: feature map whose features are scored
    :return: the normalised score, or 0.0 when ``fm1`` has no features
    """
    num_feats = fm1.GetNumFeatures()
    if not num_feats:
        # A map without features cannot match anything; avoid dividing by zero.
        return 0.0
    return fm1.ScoreFeats(fm2.GetFeatures()) / num_feats
| 66 | + |
def build_feat_data(mols):
    """Build the feature maps and do the all vs. all comparison.

    Each molecule's feature map is built exactly once and then reused for
    every pairwise score, rather than being rebuilt for each pair.

    :param mols: iterable of molecules
    :return: tuple ``(fmaps, scores)`` where ``fmaps[i]`` is the feature map of
        the i-th molecule and ``scores[i][j]`` is
        ``score_featmaps(fmaps[i], fmaps[j])``
    """
    fmaps = [getFeatureMap(mol) for mol in mols]
    scores = [[score_featmaps(fm1, fm2) for fm2 in fmaps] for fm1 in fmaps]
    return fmaps, scores
| 82 | + |
def find_closest(scores):
    """Find the highest off-diagonal entry of a square score matrix.

    The diagonal (an item scored against itself) is ignored. When several
    entries share the highest score the first one in row-major order wins.
    A pair is always returned for two or more rows, even if every
    off-diagonal score is zero.

    :param scores: list of rows, each row a list of scores
    :return: tuple ``(best_score, best_row, best_col)``
    :raises ValueError: if ``scores`` has fewer than two rows
    """
    size = len(scores)
    if size < 2:
        raise ValueError('Need at least 2 feature maps to find the closest pair')

    # Start from None rather than 0 so that a pair is selected even when no
    # off-diagonal score is positive.
    best_score = best_row = best_col = None
    for i, row in enumerate(scores):
        for j in range(size):
            if i == j:
                continue
            score = row[j]
            if best_score is None or score > best_score:
                best_score, best_row, best_col = score, i, j
    return best_score, best_row, best_col
| 96 | + |
def merge_feat_maps(fmaps, scores):
    """Merge the 2 closest feature maps, remove them from the data and replace with the merged feature map.

    Both ``fmaps`` and ``scores`` are modified in place: the two source maps
    (and their score rows and columns) are deleted and the merged map is
    appended as the last entry. Nothing is returned.
    """
    top_score, row_idx, col_idx = find_closest(scores)
    fmap_a = fmaps[row_idx]
    fmap_b = fmaps[col_idx]
    utils.log('Merging', row_idx, 'and', col_idx, 'with score', top_score, '#features:', fmap_a.GetNumFeatures(), fmap_b.GetNumFeatures())
    combined = CombineFeatMaps(fmap_a, fmap_b, mergeMetric=1, mergeTol=1.5, dirMergeMode=0)

    # Delete the larger index first so the smaller index still refers to the same entry.
    hi = max(row_idx, col_idx)
    lo = min(row_idx, col_idx)
    for idx in (hi, lo):
        del fmaps[idx]
        del scores[idx]
    for remaining_row in scores:
        del remaining_row[hi]
        del remaining_row[lo]

    # Score each surviving map against the merged one in both directions:
    # one value extends the existing row, the other goes into the new row.
    new_row = []
    for fmap, existing_row in zip(fmaps, scores):
        forward = score_featmaps(fmap, combined)
        backward = score_featmaps(combined, fmap)
        existing_row.append(forward)
        new_row.append(backward)

    fmaps.append(combined)
    new_row.append(score_featmaps(combined, combined))
    scores.append(new_row)
| 133 | + |
| 134 | + |
def process(inputs, fname):
    """Merge the feature maps of all input molecules and pickle the result.

    Falsy entries in ``inputs`` are skipped. The remaining feature maps are
    merged pairwise, closest pair first, until a single map remains, which is
    then written to ``fname`` as a pickle.

    :param inputs: iterable of molecules (falsy items are ignored)
    :param fname: path of the pickle file to write
    :return: tuple ``(number of molecules used, number of features in the merged map)``
    :raises ValueError: if ``inputs`` contains no usable molecules
    """
    mols = [m for m in inputs if m]
    if not mols:
        # Without this check the failure would be an IndexError further down.
        raise ValueError('No valid molecules found in the input')

    fmaps, scores = build_feat_data(mols)
    # Work on a copy so the merge loop does not alter the original list.
    merged_fmaps = list(fmaps)
    utils.log('Processing', len(fmaps), 'molecules')
    while len(merged_fmaps) > 1:
        merge_feat_maps(merged_fmaps, scores)
    merged_fmap = merged_fmaps[0]

    # Use a context manager so the pickle file is flushed and closed.
    with open(fname, "wb") as pickle_file:
        pickle.dump(merged_fmap, pickle_file)
    utils.log('Wrote merged feature map with', merged_fmap.GetNumFeatures(), 'features as pickle to', fname)

    return len(mols), merged_fmap.GetNumFeatures()
| 148 | + |
| 149 | +### start main execution ######################################### |
| 150 | + |
def main():
    """Command-line entry point: read molecules, build the merged feature map and pickle it.

    With ``--metrics`` a metrics record is also written through
    ``utils.write_metrics``.
    """
    parser = argparse.ArgumentParser(description='FeatureStein generation with RDKit')
    parameter_utils.add_default_input_args(parser)
    parser.add_argument('-f', '--feat-map', default='featurestein.p', help='Name of pickle to generate')
    parser.add_argument('--metrics', action='store_true', help='Write metrics')

    args = parser.parse_args()
    utils.log("FeatureStein Args: ", args)

    inputs_file, inputs_supplr = rdkit_utils. \
        default_open_input(args.input, args.informat)

    try:
        # this does the processing
        num_mols, num_feats = process(inputs_supplr, args.feat_map)
    finally:
        # Close the input even when processing fails.
        inputs_file.close()

    if args.metrics:
        # This script has no output argument, so the metrics base name is
        # derived from the pickle file name (extension removed).
        # NOTE(review): confirm this is where the metrics are expected.
        output_base = os.path.splitext(args.feat_map)[0]
        # NOTE(review): 'RDKitFeatureMap' is reported as the molecule count, as in the
        # original code - confirm that this is the intended value.
        utils.write_metrics(output_base, {
            '__StatusMessage__': 'Generated ' + str(num_feats) + ' from ' + str(num_mols) + ' molecules',
            '__InputCount__': num_mols,
            'RDKitFeatureMap': num_mols})


if __name__ == "__main__":
    main()
0 commit comments