Skip to content

Commit 131d4fa

Browse files
committed
featurestein and xcos scoring
1 parent 7dbe6ca commit 131d4fa

File tree

7 files changed

+797
-0
lines changed

7 files changed

+797
-0
lines changed

data/mpro/hits-17.sdf.gz

3.74 KB
Binary file not shown.

data/mpro/poses.sdf.gz

5.54 KB
Binary file not shown.

requirements-rdkit.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,5 @@ im-pipelines-utils-rdkit==1.5.*
33
matplotlib==2.2.*
44
molvs==0.1.1
55
standardiser==0.1.9
6+
pandas==1.0.1
7+
scikit-learn==0.22.1

src/nextflow/rdkit/xcos.nf

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
#!/usr/bin/env nextflow
2+
3+
params.inputs = "data/mpro/poses.sdf"
4+
params.fragments = "data/mpro/hits-17.sdf"
5+
params.chunk = 500
6+
params.limit = 0
7+
params.digits = 4
8+
9+
inputs = file(params.inputs)
10+
fragments = file(params.fragments)
11+
12+
process splitter {
13+
14+
container 'informaticsmatters/rdkit_pipelines:latest'
15+
16+
input:
17+
file inputs
18+
19+
output:
20+
file 'inputs_part*.sdf.gz' into inputs_parts mode flatten
21+
22+
"""
23+
python -m pipelines_utils_rdkit.filter -i '$inputs' -c $params.chunk -l $params.limit -d $params.digits -o 'inputs_part_' -of sdf
24+
"""
25+
}
26+
27+
process xcos {
28+
29+
container 'informaticsmatters/rdkit_pipelines:latest'
30+
31+
input:
32+
file part from inputs_parts
33+
file fragments
34+
35+
output:
36+
file 'scored_part*.sdf' into scored_parts
37+
38+
"""
39+
python -m pipelines.rdkit.xcos -i '$part' -f '$fragments' -o '${part.name.replace('inputs', 'scored')[0..-8]}' -of sdf --no-gzip
40+
"""
41+
}
42+
43+
process joiner {
44+
45+
container 'informaticsmatters/rdkit_pipelines:latest'
46+
47+
publishDir ".", mode: 'move'
48+
49+
input:
50+
file parts from scored_parts.collect()
51+
52+
output:
53+
file 'xcos_scored.sdf.gz'
54+
55+
"""
56+
cat '$parts' | gzip > xcos_scored.sdf.gz
57+
"""
58+
}
Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
#!/usr/bin/env python
2+
3+
# Copyright 2020 Informatics Matters Ltd.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
"""
18+
Ligand pose scoring using 'FeatureStein'.
19+
This module generates a merged feature map from a set of 3D ligands.
20+
The output is a pickle of the merged feature map that can be read by the featurestein-score.py module to
21+
generate scores.
22+
"""
23+
24+
from __future__ import print_function
25+
import argparse, os, sys, gzip, pickle
26+
27+
from rdkit import Chem, rdBase, RDConfig
28+
from rdkit.Chem import AllChem, rdShapeHelpers
29+
from rdkit.Chem.FeatMaps import FeatMaps
30+
from rdkit.Chem.FeatMaps.FeatMapUtils import CombineFeatMaps
31+
32+
from pipelines_utils import parameter_utils, utils
33+
from pipelines_utils_rdkit import rdkit_utils
34+
35+
36+
### start function definitions #########################################
37+
38+
ffact = AllChem.BuildFeatureFactory(os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef'))
39+
40+
fmParams = {}
41+
for k in ffact.GetFeatureFamilies():
42+
fparams = FeatMaps.FeatMapParams()
43+
fmParams[k] = fparams
44+
45+
exclude = ()
46+
47+
def filterFeature(f):
48+
if f.GetFamily() in exclude:
49+
return None
50+
else:
51+
return f
52+
53+
def getRawFeatures(mol):
54+
rawFeats = ffact.GetFeaturesForMol(mol)
55+
# filter that list down to only include the ones we're interested in
56+
filtered = list(filter(filterFeature, rawFeats))
57+
return filtered
58+
59+
def getFeatureMap(mol):
60+
feats = getRawFeatures(mol)
61+
return FeatMaps.FeatMap(feats=feats, weights=[1]*len(feats),params=fmParams)
62+
63+
def score_featmaps(fm1, fm2):
64+
"Generate the score for 2 feature maps"
65+
return fm1.ScoreFeats(fm2.GetFeatures()) / fm1.GetNumFeatures()
66+
67+
def build_feat_data(mols):
68+
"Build the feature maps and do the all vs. all comparison"
69+
fmaps = []
70+
scores = []
71+
for mol1 in mols:
72+
fm1 = getFeatureMap(mol1)
73+
fmaps.append(fm1)
74+
row = []
75+
for mol2 in mols:
76+
fm2 = getFeatureMap(mol2)
77+
score = score_featmaps(fm1, fm2)
78+
row.append(score)
79+
#print(len(data), len(row), score)
80+
scores.append(row)
81+
return fmaps, scores
82+
83+
def find_closest(scores):
84+
#print('Find closest for', len(scores), len(scores[0]))
85+
best_score = 0
86+
for i in range(len(scores)):
87+
for j in range(len(scores)):
88+
if i == j:
89+
continue
90+
score = scores[i][j]
91+
if score > best_score:
92+
best_score = score
93+
best_row = i
94+
best_col = j
95+
return best_score, best_row, best_col
96+
97+
def merge_feat_maps(fmaps, scores):
98+
"Merge the 2 closest feature maps, remove them form the data and replace with the merged feature map"
99+
best_score, best_row, best_col = find_closest(scores)
100+
#print(best_score, best_row, best_col)
101+
feat1 = fmaps[best_row]
102+
feat2 = fmaps[best_col]
103+
utils.log('Merging', best_row, 'and', best_col, 'with score', best_score, '#features:', feat1.GetNumFeatures(), feat2.GetNumFeatures())
104+
merged = CombineFeatMaps(feat1, feat2, mergeMetric=1, mergeTol=1.5, dirMergeMode=0)
105+
# need to make sure we delete the biggest index first to avoid changing the smaller index
106+
if best_row > best_col:
107+
a = best_row
108+
b = best_col
109+
else:
110+
a = best_col
111+
b = best_row
112+
113+
#print('Initial:', len(fmaps), len(scores), ','.join([str(len(x)) for x in scores]))
114+
del fmaps[a]
115+
del fmaps[b]
116+
del scores[a]
117+
del scores[b]
118+
for row in scores:
119+
del row[a]
120+
del row[b]
121+
122+
merged_scores = []
123+
for i in range(len(fmaps)):
124+
fmap = fmaps[i]
125+
score1 = score_featmaps(fmap, merged)
126+
score2 = score_featmaps(merged, fmap)
127+
scores[i].append(score1)
128+
merged_scores.append(score2)
129+
130+
fmaps.append(merged)
131+
merged_scores.append(score_featmaps(merged, merged))
132+
scores.append(merged_scores)
133+
134+
135+
def process(inputs, fname):
136+
137+
mols = [m for m in inputs if m]
138+
fmaps, scores = build_feat_data(mols)
139+
merged_fmaps = fmaps.copy()
140+
utils.log('Processing', len(fmaps), 'molecules')
141+
while len(merged_fmaps) > 1:
142+
merge_feat_maps(merged_fmaps, scores)
143+
merged_fmap = merged_fmaps[0]
144+
pickle.dump(merged_fmap, open(fname, "wb" ))
145+
utils.log('Wrote merged feature map with', merged_fmap.GetNumFeatures(), 'features as pickle to', fname)
146+
147+
return len(mols), merged_fmap.GetNumFeatures()
148+
149+
### start main execution #########################################
150+
151+
def main():
152+
153+
global fmaps
154+
155+
parser = argparse.ArgumentParser(description='FeatureStein generation with RDKit')
156+
parameter_utils.add_default_input_args(parser)
157+
parser.add_argument('-f', '--feat-map', default='featurestein.p', help='Name of pickle to generate')
158+
parser.add_argument('--metrics', action='store_true', help='Write metrics')
159+
160+
args = parser.parse_args()
161+
utils.log("FeatureStein Args: ", args)
162+
163+
inputs_file, inputs_supplr = rdkit_utils. \
164+
default_open_input(args.input, args.informat)
165+
166+
# this does the processing
167+
num_mols, num_feats = process(inputs_supplr, args.feat_map)
168+
169+
inputs_file.close()
170+
171+
if args.metrics:
172+
utils.write_metrics(output_base, {'__StatusMessage__': 'Generated ' + num_feats + ' from ' + num_mols + ' molecules',
173+
'__InputCount__':num_mols, 'RDKitFeatureMap':num_mols})
174+
175+
176+
if __name__ == "__main__":
177+
main()
Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
#!/usr/bin/env python
2+
3+
# Copyright 2020 Informatics Matters Ltd.
4+
#
5+
# Licensed under the Apache License, Version 2.0 (the "License");
6+
# you may not use this file except in compliance with the License.
7+
# You may obtain a copy of the License at
8+
#
9+
# http://www.apache.org/licenses/LICENSE-2.0
10+
#
11+
# Unless required by applicable law or agreed to in writing, software
12+
# distributed under the License is distributed on an "AS IS" BASIS,
13+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
# See the License for the specific language governing permissions and
15+
# limitations under the License.
16+
17+
"""
18+
Ligand pose scoring using 'FeatureStein'.
19+
FeatureStein is a merged RDKit feature map that estimates the overlap of a ligand with a set of ligands (e.g. fragment
20+
screening hits) based on the RDKit feature maps.
21+
See featurestein-generate.py for how to generate the merged feature maps.
22+
"""
23+
24+
from __future__ import print_function
25+
import argparse, os, sys, gzip, pickle, traceback
26+
from rdkit import Chem, rdBase, RDConfig
27+
from rdkit.Chem import AllChem, rdShapeHelpers
28+
from rdkit.Chem.FeatMaps import FeatMaps
29+
from pipelines_utils import parameter_utils, utils
30+
from pipelines_utils_rdkit import rdkit_utils
31+
32+
33+
### start function definitions #########################################
34+
35+
field_FeatureSteinScore = "FeatureStein_Score"
36+
37+
# Setting up the features to use in FeatureMap
38+
ffact = AllChem.BuildFeatureFactory(os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef'))
39+
fmaps = None
40+
41+
42+
def filter_feature(f):
43+
result = f.GetFamily() in fmaps.params.keys()
44+
return result
45+
46+
def get_raw_features(mol):
47+
rawFeats = ffact.GetFeaturesForMol(mol)
48+
# filter that list down to only include the ones we're interested in
49+
filtered = list(filter(filter_feature, rawFeats))
50+
return filtered
51+
52+
def create_feature_map(mol):
53+
feats = get_raw_features(mol)
54+
return FeatMaps.FeatMap(feats=feats, weights=[1]*len(feats),params=fmaps.params)
55+
56+
def score_featmaps(fm1):
57+
"Generate the score for 2 feature maps"
58+
if fm1.GetNumFeatures() == 0:
59+
return 0
60+
else:
61+
score = fm1.ScoreFeats(fmaps.GetFeatures())
62+
#utils.log(score, fm1.GetNumFeatures())
63+
return score / fm1.GetNumFeatures()
64+
65+
def get_fmap_score(mol):
66+
featMap = create_feature_map(mol)
67+
score = score_featmaps(featMap)
68+
return score
69+
70+
def process(inputs, writer):
71+
total = 0
72+
success = 0
73+
errors = 0
74+
for mol in inputs:
75+
total += 1
76+
if mol is None:
77+
errors += 1
78+
continue
79+
try:
80+
score = get_fmap_score(mol)
81+
# utils.log('Score:', score)
82+
if total % 1000 == 0:
83+
utils.log('Processed molecule', total, '...')
84+
mol.SetDoubleProp(field_FeatureSteinScore, score)
85+
writer.write(mol)
86+
success += 1
87+
except:
88+
utils.log("Error scoring molecule", sys.exc_info()[0])
89+
traceback.print_exc()
90+
errors += 1
91+
92+
return total, success, errors
93+
94+
### start main execution #########################################
95+
96+
def main():
97+
98+
global fmaps
99+
100+
parser = argparse.ArgumentParser(description='FeatureStein scoring with RDKit')
101+
parameter_utils.add_default_io_args(parser)
102+
parser.add_argument('-f', '--feat-map', help='Feature Map pickle to score with')
103+
parser.add_argument('--metrics', action='store_true', help='Write metrics')
104+
105+
106+
args = parser.parse_args()
107+
utils.log("FeatureStein Args: ", args)
108+
109+
source = "featurestein-score.py"
110+
datasetMetaProps = {"source":source, "description": "FeatureStein scoring using RDKit " + rdBase.rdkitVersion}
111+
112+
clsMappings = {}
113+
fieldMetaProps = []
114+
clsMappings[field_FeatureSteinScore] = "java.lang.Float"
115+
fieldMetaProps.append({"fieldName":field_FeatureSteinScore, "values": {"source":source, "description":"FeatureStein score"}})
116+
117+
pkl_file = open(args.feat_map, 'rb')
118+
fmaps = pickle.load(pkl_file)
119+
utils.log('FeatureMap has', fmaps.GetNumFeatures(), "features")
120+
121+
inputs_file,output,inputs_supplr,writer,output_base = rdkit_utils. \
122+
default_open_input_output(args.input, args.informat, args.output,
123+
'featurestein', args.outformat,
124+
valueClassMappings=clsMappings,
125+
datasetMetaProps=datasetMetaProps,
126+
fieldMetaProps=fieldMetaProps)
127+
128+
# this does the processing
129+
total, success, errors = process(inputs_supplr, writer)
130+
131+
inputs_file.close()
132+
writer.flush()
133+
writer.close()
134+
output.close()
135+
136+
if args.metrics:
137+
utils.write_metrics(output_base, {'__InputCount__':total, '__OutputCount__':success, '__ErrorCount__':errors, 'RDKitFeatureMap':success})
138+
139+
140+
if __name__ == "__main__":
141+
main()

0 commit comments

Comments
 (0)