Skip to content

Commit 3f20c0e

Browse files
committed
created combined featurestein generate and score
1 parent 8563f24 commit 3f20c0e

File tree

3 files changed

+271
-2
lines changed

3 files changed

+271
-2
lines changed

src/python/pipelines/xchem/featurestein_generate.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@
4242
fparams = FeatMaps.FeatMapParams()
4343
fmParams[k] = fparams
4444

45-
exclude = ()
45+
exclude = []
4646

4747
def filterFeature(f):
4848
if f.GetFamily() in exclude:
@@ -88,7 +88,7 @@ def find_closest(scores):
8888
if i == j:
8989
continue
9090
score = scores[i][j]
91-
utils.log('Score:', score)
91+
#utils.log('Score:', score)
9292
if score > best_score:
9393
best_score = score
9494
best_row = i
Lines changed: 248 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,248 @@
1+
#!/usr/bin/env python
2+
3+
# Copyright 2020 Informatics Matters Ltd.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
"""
18+
Ligand pose scoring using 'FeatureStein'.
19+
FeatureStein is a merged RDKit feature map that estimates the overlap of a ligand with a set of ligands (e.g. fragment
20+
screening hits) based on the RDKit feature maps.
21+
This modules generates merged features maps for a set of fragments and then scores a a set of ligands with that merged feature map.
22+
See featurestein_generate.py and featurestein_generate.py for how to separate these into separate processes.
23+
"""
24+
25+
from __future__ import print_function
26+
import argparse, os, sys, gzip, pickle, traceback
27+
28+
from rdkit import Chem, rdBase, RDConfig
29+
from rdkit.Chem import AllChem, rdShapeHelpers
30+
from rdkit.Chem.FeatMaps import FeatMaps
31+
from rdkit.Chem.FeatMaps.FeatMapUtils import CombineFeatMaps
32+
33+
from pipelines_utils import parameter_utils, utils
34+
from pipelines_utils_rdkit import rdkit_utils
35+
36+
37+
### start function definitions #########################################
38+
39+
field_FeatureSteinQualityScore = "FeatureStein_Qual"
40+
field_FeatureSteinQuantityScore = "FeatureStein_Quant"
41+
42+
# Setting up the features to use in FeatureMap
43+
ffact = AllChem.BuildFeatureFactory(os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef'))
44+
fmParams = {}
45+
for k in ffact.GetFeatureFamilies():
46+
fparams = FeatMaps.FeatMapParams()
47+
fmParams[k] = fparams
48+
exclude = []
49+
50+
fmaps = None
51+
52+
def filter_feature(f):
53+
if f.GetFamily() in exclude:
54+
return None
55+
else:
56+
return f
57+
58+
def get_raw_features(mol):
59+
rawFeats = ffact.GetFeaturesForMol(mol)
60+
# filter that list down to only include the ones we're interested in
61+
filtered = list(filter(filter_feature, rawFeats))
62+
return filtered
63+
64+
def create_feature_map(mol):
65+
feats = get_raw_features(mol)
66+
return FeatMaps.FeatMap(feats=feats, weights=[1]*len(feats),params=fmParams)
67+
68+
def score_featmaps(fm1, fm2):
69+
"""Generate the score for 2 feature maps"""
70+
if fm1.GetNumFeatures() == 0:
71+
return 0, 0
72+
else:
73+
score = fm1.ScoreFeats(fm2.GetFeatures())
74+
#utils.log(score, fm1.GetNumFeatures())
75+
return score, score / fm1.GetNumFeatures()
76+
77+
def get_fmap_scores(mol):
78+
"Score the molecule using the merged feature map"
79+
featMap = create_feature_map(mol)
80+
quantScore, qualScore = score_featmaps(fmaps, featMap)
81+
return quantScore, qualScore
82+
83+
def build_feat_data(mols):
84+
"Build the feature maps for the molecules and do the all vs. all comparison"
85+
featuremaps = []
86+
scores = []
87+
for mol1 in mols:
88+
fm1 = create_feature_map(mol1)
89+
featuremaps.append(fm1)
90+
row = []
91+
for mol2 in mols:
92+
fm2 = create_feature_map(mol2)
93+
raw_score, norm_score = score_featmaps(fm1, fm2)
94+
row.append(norm_score)
95+
#utils.log(len(data), len(row), raw_score, norm_score)
96+
scores.append(row)
97+
return featuremaps, scores
98+
99+
def find_closest(scores):
100+
#utils.log('Find closest for', len(scores), len(scores[0]))
101+
best_score = -1.0
102+
for i in range(len(scores)):
103+
for j in range(len(scores)):
104+
if i == j:
105+
continue
106+
score = scores[i][j]
107+
#utils.log('Score:', score)
108+
if score > best_score:
109+
best_score = score
110+
best_row = i
111+
best_col = j
112+
return best_score, best_row, best_col
113+
114+
def merge_feat_maps(fmaps, scores):
115+
"Merge the 2 closest feature maps, remove them form the data and replace with the merged feature map"
116+
best_score, best_row, best_col = find_closest(scores)
117+
#utils.log(best_score, best_row, best_col)
118+
feat1 = fmaps[best_row]
119+
feat2 = fmaps[best_col]
120+
utils.log('Merging', best_row, 'and', best_col, 'with score', best_score, '#features:', feat1.GetNumFeatures(), feat2.GetNumFeatures())
121+
merged = CombineFeatMaps(feat1, feat2, mergeMetric=1, mergeTol=1.5, dirMergeMode=0)
122+
# need to make sure we delete the biggest index first to avoid changing the smaller index
123+
if best_row > best_col:
124+
a = best_row
125+
b = best_col
126+
else:
127+
a = best_col
128+
b = best_row
129+
130+
#utils.log('Initial:', len(fmaps), len(scores), ','.join([str(len(x)) for x in scores]))
131+
del fmaps[a]
132+
del fmaps[b]
133+
del scores[a]
134+
del scores[b]
135+
for row in scores:
136+
del row[a]
137+
del row[b]
138+
139+
merged_scores = []
140+
for i in range(len(fmaps)):
141+
fmap = fmaps[i]
142+
score1 = score_featmaps(fmap, merged)[1]
143+
score2 = score_featmaps(merged, fmap)[1]
144+
scores[i].append(score1)
145+
merged_scores.append(score2)
146+
147+
fmaps.append(merged)
148+
merged_scores.append(score_featmaps(merged, merged))
149+
scores.append(merged_scores)
150+
151+
def score_molecules(molecules, writer):
152+
total = 0
153+
success = 0
154+
errors = 0
155+
for mol in molecules:
156+
total += 1
157+
if mol is None:
158+
errors += 1
159+
continue
160+
try:
161+
quantScore, qualScore = get_fmap_scores(mol)
162+
# utils.log('Score:', score)
163+
if total % 1000 == 0:
164+
utils.log('Processed molecule', total, '...')
165+
mol.SetDoubleProp(field_FeatureSteinQualityScore, qualScore)
166+
mol.SetDoubleProp(field_FeatureSteinQuantityScore, quantScore)
167+
writer.write(mol)
168+
success += 1
169+
except:
170+
utils.log("Error scoring molecule", sys.exc_info()[0])
171+
traceback.print_exc()
172+
errors += 1
173+
174+
return total, success, errors
175+
176+
def create_featuremap(fragments):
177+
178+
mols = [m for m in fragments if m]
179+
fmaps, scores = build_feat_data(mols)
180+
merged_fmaps = fmaps.copy()
181+
utils.log('Processing', len(fmaps), 'molecules')
182+
while len(merged_fmaps) > 1:
183+
merge_feat_maps(merged_fmaps, scores)
184+
merged_fmap = merged_fmaps[0]
185+
utils.log('Created merged feature map with', merged_fmap.GetNumFeatures(), 'features')
186+
187+
return merged_fmap
188+
189+
### start main execution #########################################
190+
191+
def main():
192+
193+
# Example usage
194+
# python -m pipelines.xchem.featurestein_generate_and_score -i ../../data/mpro/poses.sdf.gz -f ../../data/mpro/hits-17.sdf.gz -o output_fs
195+
196+
global fmaps
197+
198+
parser = argparse.ArgumentParser(description='FeatureStein scoring with RDKit')
199+
parameter_utils.add_default_io_args(parser)
200+
parser.add_argument('-f', '--fragments', help='Fragments to use to generate the feature map')
201+
parser.add_argument('-ff', '--fragments-format', help='Fragments format')
202+
parser.add_argument('--no-gzip', action='store_true', help='Do not compress the output (STDOUT is never compressed')
203+
parser.add_argument('--metrics', action='store_true', help='Write metrics')
204+
205+
206+
args = parser.parse_args()
207+
utils.log("FeatureStein Args: ", args)
208+
209+
source = "featurestein_generate_and_score.py"
210+
datasetMetaProps = {"source":source, "description": "FeatureStein scoring using RDKit " + rdBase.rdkitVersion}
211+
212+
clsMappings = {}
213+
fieldMetaProps = []
214+
clsMappings[field_FeatureSteinQualityScore] = "java.lang.Float"
215+
clsMappings[field_FeatureSteinQuantityScore] = "java.lang.Float"
216+
fieldMetaProps.append({"fieldName":field_FeatureSteinQualityScore, "values": {"source":source, "description":"FeatureStein quality score"},
217+
"fieldName":field_FeatureSteinQuantityScore, "values": {"source":source, "description":"FeatureStein quantity score"}})
218+
219+
# generate the feature maps
220+
frags_input, frags_suppl = rdkit_utils.default_open_input(args.fragments, args.fragments_format)
221+
222+
fmaps = create_featuremap(frags_suppl)
223+
frags_input.close()
224+
225+
# read the ligands to be scored
226+
inputs_file, inputs_supplr = rdkit_utils.default_open_input(args.input, args.informat)
227+
output, writer, output_base = rdkit_utils.default_open_output(args.output,
228+
'featurestein', args.outformat,
229+
valueClassMappings=clsMappings,
230+
datasetMetaProps=datasetMetaProps,
231+
fieldMetaProps=fieldMetaProps,
232+
compress=not args.no_gzip)
233+
234+
# do the scoring
235+
total, success, errors = score_molecules(inputs_supplr, writer)
236+
utils.log('Scored', success, 'molecules.', errors, 'errors.')
237+
238+
inputs_file.close()
239+
writer.flush()
240+
writer.close()
241+
output.close()
242+
243+
if args.metrics:
244+
utils.write_metrics(output_base, {'__InputCount__':total, '__OutputCount__':success, '__ErrorCount__':errors, 'RDKitFeatureMap':success})
245+
246+
247+
if __name__ == "__main__":
248+
main()
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
// Automated pipeline test specification.
2+
3+
[
4+
5+
version = 1,
6+
7+
// Testing featurestein_generate_and_score.py
8+
// reading from STDIN and writing to STDOUT
9+
test_featurestein_generate_and_score = [
10+
11+
command: '''python -m pipelines.xchem.featurestein_generate_and_score
12+
-f ${PIN}/mpro/hits-17.sdf.gz
13+
-i ${PIN}/mpro/poses.sdf.gz
14+
-o ${POUT}FSOUT''',
15+
16+
stderr: [ 'Scored 14 molecules. 0 errors.' ],
17+
18+
creates: [ 'FSOUT.sdf.gz' ]
19+
]
20+
21+
]

0 commit comments

Comments
 (0)