|
| 1 | +#!/usr/bin/env python |
| 2 | + |
| 3 | +# Copyright 2020 Informatics Matters Ltd. |
| 4 | +# |
| 5 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 6 | +# you may not use this file except in compliance with the License. |
| 7 | +# You may obtain a copy of the License at |
| 8 | +# |
| 9 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | +# |
| 11 | +# Unless required by applicable law or agreed to in writing, software |
| 12 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | +# See the License for the specific language governing permissions and |
| 15 | +# limitations under the License. |
| 16 | + |
| 17 | +""" |
| 18 | +Ligand pose scoring using 'FeatureStein'. |
| 19 | +FeatureStein is a merged RDKit feature map that estimates the overlap of a ligand with a set of ligands (e.g. fragment |
| 20 | +screening hits) based on the RDKit feature maps. |
| 21 | +This modules generates merged features maps for a set of fragments and then scores a a set of ligands with that merged feature map. |
| 22 | +See featurestein_generate.py and featurestein_generate.py for how to separate these into separate processes. |
| 23 | +""" |
| 24 | + |
| 25 | +from __future__ import print_function |
| 26 | +import argparse, os, sys, gzip, pickle, traceback |
| 27 | + |
| 28 | +from rdkit import Chem, rdBase, RDConfig |
| 29 | +from rdkit.Chem import AllChem, rdShapeHelpers |
| 30 | +from rdkit.Chem.FeatMaps import FeatMaps |
| 31 | +from rdkit.Chem.FeatMaps.FeatMapUtils import CombineFeatMaps |
| 32 | + |
| 33 | +from pipelines_utils import parameter_utils, utils |
| 34 | +from pipelines_utils_rdkit import rdkit_utils |
| 35 | + |
| 36 | + |
| 37 | +### start function definitions ######################################### |
| 38 | + |
| 39 | +field_FeatureSteinQualityScore = "FeatureStein_Qual" |
| 40 | +field_FeatureSteinQuantityScore = "FeatureStein_Quant" |
| 41 | + |
| 42 | +# Setting up the features to use in FeatureMap |
| 43 | +ffact = AllChem.BuildFeatureFactory(os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef')) |
| 44 | +fmParams = {} |
| 45 | +for k in ffact.GetFeatureFamilies(): |
| 46 | + fparams = FeatMaps.FeatMapParams() |
| 47 | + fmParams[k] = fparams |
| 48 | +exclude = [] |
| 49 | + |
| 50 | +fmaps = None |
| 51 | + |
| 52 | +def filter_feature(f): |
| 53 | + if f.GetFamily() in exclude: |
| 54 | + return None |
| 55 | + else: |
| 56 | + return f |
| 57 | + |
| 58 | +def get_raw_features(mol): |
| 59 | + rawFeats = ffact.GetFeaturesForMol(mol) |
| 60 | + # filter that list down to only include the ones we're interested in |
| 61 | + filtered = list(filter(filter_feature, rawFeats)) |
| 62 | + return filtered |
| 63 | + |
| 64 | +def create_feature_map(mol): |
| 65 | + feats = get_raw_features(mol) |
| 66 | + return FeatMaps.FeatMap(feats=feats, weights=[1]*len(feats),params=fmParams) |
| 67 | + |
| 68 | +def score_featmaps(fm1, fm2): |
| 69 | + """Generate the score for 2 feature maps""" |
| 70 | + if fm1.GetNumFeatures() == 0: |
| 71 | + return 0, 0 |
| 72 | + else: |
| 73 | + score = fm1.ScoreFeats(fm2.GetFeatures()) |
| 74 | + #utils.log(score, fm1.GetNumFeatures()) |
| 75 | + return score, score / fm1.GetNumFeatures() |
| 76 | + |
| 77 | +def get_fmap_scores(mol): |
| 78 | + "Score the molecule using the merged feature map" |
| 79 | + featMap = create_feature_map(mol) |
| 80 | + quantScore, qualScore = score_featmaps(fmaps, featMap) |
| 81 | + return quantScore, qualScore |
| 82 | + |
| 83 | +def build_feat_data(mols): |
| 84 | + "Build the feature maps for the molecules and do the all vs. all comparison" |
| 85 | + featuremaps = [] |
| 86 | + scores = [] |
| 87 | + for mol1 in mols: |
| 88 | + fm1 = create_feature_map(mol1) |
| 89 | + featuremaps.append(fm1) |
| 90 | + row = [] |
| 91 | + for mol2 in mols: |
| 92 | + fm2 = create_feature_map(mol2) |
| 93 | + raw_score, norm_score = score_featmaps(fm1, fm2) |
| 94 | + row.append(norm_score) |
| 95 | + #utils.log(len(data), len(row), raw_score, norm_score) |
| 96 | + scores.append(row) |
| 97 | + return featuremaps, scores |
| 98 | + |
| 99 | +def find_closest(scores): |
| 100 | + #utils.log('Find closest for', len(scores), len(scores[0])) |
| 101 | + best_score = -1.0 |
| 102 | + for i in range(len(scores)): |
| 103 | + for j in range(len(scores)): |
| 104 | + if i == j: |
| 105 | + continue |
| 106 | + score = scores[i][j] |
| 107 | + #utils.log('Score:', score) |
| 108 | + if score > best_score: |
| 109 | + best_score = score |
| 110 | + best_row = i |
| 111 | + best_col = j |
| 112 | + return best_score, best_row, best_col |
| 113 | + |
| 114 | +def merge_feat_maps(fmaps, scores): |
| 115 | + "Merge the 2 closest feature maps, remove them form the data and replace with the merged feature map" |
| 116 | + best_score, best_row, best_col = find_closest(scores) |
| 117 | + #utils.log(best_score, best_row, best_col) |
| 118 | + feat1 = fmaps[best_row] |
| 119 | + feat2 = fmaps[best_col] |
| 120 | + utils.log('Merging', best_row, 'and', best_col, 'with score', best_score, '#features:', feat1.GetNumFeatures(), feat2.GetNumFeatures()) |
| 121 | + merged = CombineFeatMaps(feat1, feat2, mergeMetric=1, mergeTol=1.5, dirMergeMode=0) |
| 122 | + # need to make sure we delete the biggest index first to avoid changing the smaller index |
| 123 | + if best_row > best_col: |
| 124 | + a = best_row |
| 125 | + b = best_col |
| 126 | + else: |
| 127 | + a = best_col |
| 128 | + b = best_row |
| 129 | + |
| 130 | + #utils.log('Initial:', len(fmaps), len(scores), ','.join([str(len(x)) for x in scores])) |
| 131 | + del fmaps[a] |
| 132 | + del fmaps[b] |
| 133 | + del scores[a] |
| 134 | + del scores[b] |
| 135 | + for row in scores: |
| 136 | + del row[a] |
| 137 | + del row[b] |
| 138 | + |
| 139 | + merged_scores = [] |
| 140 | + for i in range(len(fmaps)): |
| 141 | + fmap = fmaps[i] |
| 142 | + score1 = score_featmaps(fmap, merged)[1] |
| 143 | + score2 = score_featmaps(merged, fmap)[1] |
| 144 | + scores[i].append(score1) |
| 145 | + merged_scores.append(score2) |
| 146 | + |
| 147 | + fmaps.append(merged) |
| 148 | + merged_scores.append(score_featmaps(merged, merged)) |
| 149 | + scores.append(merged_scores) |
| 150 | + |
| 151 | +def score_molecules(molecules, writer): |
| 152 | + total = 0 |
| 153 | + success = 0 |
| 154 | + errors = 0 |
| 155 | + for mol in molecules: |
| 156 | + total += 1 |
| 157 | + if mol is None: |
| 158 | + errors += 1 |
| 159 | + continue |
| 160 | + try: |
| 161 | + quantScore, qualScore = get_fmap_scores(mol) |
| 162 | + # utils.log('Score:', score) |
| 163 | + if total % 1000 == 0: |
| 164 | + utils.log('Processed molecule', total, '...') |
| 165 | + mol.SetDoubleProp(field_FeatureSteinQualityScore, qualScore) |
| 166 | + mol.SetDoubleProp(field_FeatureSteinQuantityScore, quantScore) |
| 167 | + writer.write(mol) |
| 168 | + success += 1 |
| 169 | + except: |
| 170 | + utils.log("Error scoring molecule", sys.exc_info()[0]) |
| 171 | + traceback.print_exc() |
| 172 | + errors += 1 |
| 173 | + |
| 174 | + return total, success, errors |
| 175 | + |
| 176 | +def create_featuremap(fragments): |
| 177 | + |
| 178 | + mols = [m for m in fragments if m] |
| 179 | + fmaps, scores = build_feat_data(mols) |
| 180 | + merged_fmaps = fmaps.copy() |
| 181 | + utils.log('Processing', len(fmaps), 'molecules') |
| 182 | + while len(merged_fmaps) > 1: |
| 183 | + merge_feat_maps(merged_fmaps, scores) |
| 184 | + merged_fmap = merged_fmaps[0] |
| 185 | + utils.log('Created merged feature map with', merged_fmap.GetNumFeatures(), 'features') |
| 186 | + |
| 187 | + return merged_fmap |
| 188 | + |
| 189 | +### start main execution ######################################### |
| 190 | + |
| 191 | +def main(): |
| 192 | + |
| 193 | + # Example usage |
| 194 | + # python -m pipelines.xchem.featurestein_generate_and_score -i ../../data/mpro/poses.sdf.gz -f ../../data/mpro/hits-17.sdf.gz -o output_fs |
| 195 | + |
| 196 | + global fmaps |
| 197 | + |
| 198 | + parser = argparse.ArgumentParser(description='FeatureStein scoring with RDKit') |
| 199 | + parameter_utils.add_default_io_args(parser) |
| 200 | + parser.add_argument('-f', '--fragments', help='Fragments to use to generate the feature map') |
| 201 | + parser.add_argument('-ff', '--fragments-format', help='Fragments format') |
| 202 | + parser.add_argument('--no-gzip', action='store_true', help='Do not compress the output (STDOUT is never compressed') |
| 203 | + parser.add_argument('--metrics', action='store_true', help='Write metrics') |
| 204 | + |
| 205 | + |
| 206 | + args = parser.parse_args() |
| 207 | + utils.log("FeatureStein Args: ", args) |
| 208 | + |
| 209 | + source = "featurestein_generate_and_score.py" |
| 210 | + datasetMetaProps = {"source":source, "description": "FeatureStein scoring using RDKit " + rdBase.rdkitVersion} |
| 211 | + |
| 212 | + clsMappings = {} |
| 213 | + fieldMetaProps = [] |
| 214 | + clsMappings[field_FeatureSteinQualityScore] = "java.lang.Float" |
| 215 | + clsMappings[field_FeatureSteinQuantityScore] = "java.lang.Float" |
| 216 | + fieldMetaProps.append({"fieldName":field_FeatureSteinQualityScore, "values": {"source":source, "description":"FeatureStein quality score"}, |
| 217 | + "fieldName":field_FeatureSteinQuantityScore, "values": {"source":source, "description":"FeatureStein quantity score"}}) |
| 218 | + |
| 219 | + # generate the feature maps |
| 220 | + frags_input, frags_suppl = rdkit_utils.default_open_input(args.fragments, args.fragments_format) |
| 221 | + |
| 222 | + fmaps = create_featuremap(frags_suppl) |
| 223 | + frags_input.close() |
| 224 | + |
| 225 | + # read the ligands to be scored |
| 226 | + inputs_file, inputs_supplr = rdkit_utils.default_open_input(args.input, args.informat) |
| 227 | + output, writer, output_base = rdkit_utils.default_open_output(args.output, |
| 228 | + 'featurestein', args.outformat, |
| 229 | + valueClassMappings=clsMappings, |
| 230 | + datasetMetaProps=datasetMetaProps, |
| 231 | + fieldMetaProps=fieldMetaProps, |
| 232 | + compress=not args.no_gzip) |
| 233 | + |
| 234 | + # do the scoring |
| 235 | + total, success, errors = score_molecules(inputs_supplr, writer) |
| 236 | + utils.log('Scored', success, 'molecules.', errors, 'errors.') |
| 237 | + |
| 238 | + inputs_file.close() |
| 239 | + writer.flush() |
| 240 | + writer.close() |
| 241 | + output.close() |
| 242 | + |
| 243 | + if args.metrics: |
| 244 | + utils.write_metrics(output_base, {'__InputCount__':total, '__OutputCount__':success, '__ErrorCount__':errors, 'RDKitFeatureMap':success}) |
| 245 | + |
| 246 | + |
| 247 | +if __name__ == "__main__": |
| 248 | + main() |
0 commit comments