Skip to content

Commit d2e123b

Browse files
committed
minor tweaks to enumerate_charges and sucos
1 parent 2a2a55b commit d2e123b

File tree

6 files changed

+110
-80
lines changed

6 files changed

+110
-80
lines changed

README.md

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -215,20 +215,26 @@ But as a quick start you should be able to run the tests in a conda environm
215215

216216
Create a conda environment containing RDKit:
217217
```
218-
conda env create -f environment-rdkit.yml
218+
conda env create -f environment-rdkit-utils.yml
219219
```
220-
221-
Move into the `pipelines-utils` repo (this should be alongside `pipelines` and `pipelines-utils-rdkit`):
220+
Now activate that environment:
222221
```
223-
cd ../pipleines-utils
222+
conda activate pipelines-utils
224223
```
225224

226-
Set your `PYTHONPATH` environment variable to include the `pipelines-utils` and `pipelines-utils-rdkit` sources
225+
Note: this environment includes `pipelines-utils` and `pipelines-utils-rdkit` from PyPI.
226+
If you need to use local changes from these repos, you will need to create a conda environment that does not contain these packages and
227+
instead set your `PYTHONPATH` environment variable to include the `pipelines-utils` and `pipelines-utils-rdkit` sources
227228
(adjusting `/path/to/` to whatever is needed):
228229
```
229230
export PYTHONPATH=/path/to/pipelines-utils/src/python:/path/to/pipelines-utils-rdkit/src/python
230231
```
231232

233+
Move into the `pipelines-utils` repo (this should be alongside `pipelines` and `pipelines-utils-rdkit`):
234+
```
235+
cd ../pipelines-utils
236+
```
237+
232238
Run tests:
233239
```
234240
./gradlew runPipelineTester -Pptargs=-opipelines

environment-rdkit.yml renamed to environment-rdkit-utils.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
name: pipelines
1+
name: pipelines-utils
22
channels:
33
- conda-forge
44
dependencies:
@@ -11,3 +11,5 @@ dependencies:
1111
- matplotlib==2.2.*
1212
- molvs==0.1.1
1313
- standardiser==0.1.9
14+
- im-pipelines-utils
15+
- im-pipelines-utils-rdkit

src/python/pipelines/dimorphite/enumerate_charges.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,6 @@ def add_src_mol_ref(src_mol, target_mol, index):
6969
### start main execution #########################################
7070

7171
def main():
72-
7372
### command line args definitions #########################################
7473

7574
parser = argparse.ArgumentParser(description='Enumerate charges')
@@ -87,14 +86,14 @@ def main():
8786

8887
# handle metadata
8988
source = "enumerate_charges.py"
90-
datasetMetaProps = {"source":source, "description": "Enumerate charges using Dimorphite-dl"}
89+
datasetMetaProps = {"source": source, "description": "Enumerate charges using Dimorphite-dl"}
9190
clsMappings = {
9291
"EnumChargesSrcMolUUID": "java.lang.String",
9392
"EnumChargesSrcMolIdx": "java.lang.Integer"
9493
}
9594
fieldMetaProps = [
96-
{"fieldName":"EnumChargesSrcMolUUID", "values": {"source":source, "description":"UUID of source molecule"}},
97-
{"fieldName":"EnumChargesSrcMolIdx", "values": {"source":source, "description":"Index of source molecule"}}
95+
{"fieldName": "EnumChargesSrcMolUUID", "values": {"source": source, "description": "UUID of source molecule"}},
96+
{"fieldName": "EnumChargesSrcMolIdx", "values": {"source": source, "description": "Index of source molecule"}}
9897
]
9998

10099
oformat = utils.determine_output_format(args.outformat)
@@ -121,6 +120,8 @@ def main():
121120
dimorphite_sys_argv.append(str(max_ph))
122121
fragment = args.fragment_method
123122
for mol in suppl:
123+
if mol is None:
124+
continue
124125
count += 1
125126
orig_sys_argv = sys.argv[:]
126127
sys.argv = dimorphite_sys_argv
@@ -132,7 +133,8 @@ def main():
132133

133134
utils.log(count, total, errors)
134135

135-
input.close()
136+
if input:
137+
input.close()
136138
writer.flush()
137139
writer.close()
138140
output.close()
@@ -142,7 +144,9 @@ def main():
142144
utils.write_squonk_datasetmetadata(output_base, False, clsMappings, datasetMetaProps, fieldMetaProps, size=total)
143145

144146
if args.meta:
145-
utils.write_metrics(output_base, {'__InputCount__':count, '__OutputCount__':total, '__ErrorCount__':errors, 'EnumerateChargesDimporphite':total})
147+
utils.write_metrics(output_base, {'__InputCount__': count, '__OutputCount__': total, '__ErrorCount__': errors,
148+
'EnumerateChargesDimporphite': total})
149+
146150

147151
if __name__ == "__main__":
148152
main()

src/python/pipelines/rdkit/sucos.py

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ def get_SucosScore(ref_mol, query_mol, tani=False, ref_features=None, query_feat
9999
to recalculate them. Use the getRawFeatures function to pre-calculate the features.
100100
101101
:param ref_mol: The reference molecule to compare to
102-
:param query_mol: The molecule to align to the reference
102+
:param query_mol: The molecule to compare to the reference
103103
:param tani: Whether to calculate Tanimoto distances
104104
:param ref_features: An optional feature map for the reference molecule, avoiding the need to re-calculate it.
105105
:param query_features: An optional feature map for the query molecule, avoiding the need to re-calculate it.
@@ -115,17 +115,21 @@ def get_SucosScore(ref_mol, query_mol, tani=False, ref_features=None, query_feat
115115
fm_score = get_FeatureMapScore(ref_features, query_features, tani, score_mode)
116116
fm_score = np.clip(fm_score, 0, 1)
117117

118-
if tani:
119-
tani_sim = 1 - float(rdShapeHelpers.ShapeTanimotoDist(ref_mol, query_mol))
120-
tani_sim = np.clip(tani_sim, 0, 1)
121-
SuCOS_score = 0.5*fm_score + 0.5*tani_sim
122-
return SuCOS_score, fm_score, tani_sim
123-
else:
124-
protrude_dist = rdShapeHelpers.ShapeProtrudeDist(ref_mol, query_mol, allowReordering=False)
125-
protrude_dist = np.clip(protrude_dist, 0, 1)
126-
protrude_val = 1.0 - protrude_dist
127-
SuCOS_score = 0.5 * fm_score + 0.5 * protrude_val
128-
return SuCOS_score, fm_score, protrude_val
118+
try :
119+
if tani:
120+
tani_sim = 1 - float(rdShapeHelpers.ShapeTanimotoDist(ref_mol, query_mol))
121+
tani_sim = np.clip(tani_sim, 0, 1)
122+
SuCOS_score = 0.5*fm_score + 0.5*tani_sim
123+
return SuCOS_score, fm_score, tani_sim
124+
else:
125+
protrude_dist = rdShapeHelpers.ShapeProtrudeDist(ref_mol, query_mol, allowReordering=False)
126+
protrude_dist = np.clip(protrude_dist, 0, 1)
127+
protrude_val = 1.0 - protrude_dist
128+
SuCOS_score = 0.5 * fm_score + 0.5 * protrude_val
129+
return SuCOS_score, fm_score, protrude_val
130+
except:
131+
utils.log("Failed to calculate SuCOS scores. Returning 0,0,0")
132+
return 0, 0, 0
129133

130134
def process(target_mol, inputs_supplr, writer, tani=False, score_mode=FeatMaps.FeatMapScoreMode.All):
131135

src/python/pipelines/rdkit/sucos.test

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,23 +12,23 @@
1212
-i ${PIN}/sucos/benzene.sdf -if sdf
1313
-o ${POUT}output -of sdf''',
1414

15-
stderr: [ 'Scores: 0.843' ],
15+
stderr: [ 'Scores: 0.8' ],
1616

1717
creates: [ 'output.sdf.gz' ],
1818

1919
],
2020

21-
test_raw_sucos_sdf_pick_target_1 = [
21+
test_raw_sucos_sdf_pick_target_1 = [
2222

23-
command: '''python -m pipelines.rdkit.sucos
24-
--target ${PIN}/sucos/mols.sdf
25-
--targetidx 1
26-
-i ${PIN}/sucos/mols.sdf -if sdf
27-
-o ${POUT}output -of sdf''',
23+
command: '''python -m pipelines.rdkit.sucos
24+
--target ${PIN}/sucos/mols.sdf
25+
--targetidx 1
26+
-i ${PIN}/sucos/mols.sdf -if sdf
27+
-o ${POUT}output -of sdf''',
2828

29-
stderr: [ 'Scores: 0.843' ],
29+
stderr: [ 'Scores: 0.8' ],
3030

31-
creates: [ 'output.sdf.gz' ],
31+
creates: [ 'output.sdf.gz' ],
3232

3333
],
3434

src/python/pipelines/rdkit/sucos_max.py

Lines changed: 61 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,7 @@
5252
field_SuCOSCum_ProtrudeScore = "SuCOS_Cum_Protrude_Score"
5353

5454

55-
def process(inputs_supplr, targets_supplr, writer, field_name):
56-
55+
def process(inputs_supplr, targets_supplr, writer, field_name, filter_value, filter_field):
5756
cluster = []
5857
mol_ids = []
5958
i = 0
@@ -72,6 +71,7 @@ def process(inputs_supplr, targets_supplr, writer, field_name):
7271
cluster.append((mol, features))
7372
except:
7473
utils.log("WARNING: failed to generate features for molecule", i, sys.exc_info())
74+
utils.log("Generated features for", len(cluster), "molecules")
7575

7676
comparisons = 0
7777
mol_num = 0
@@ -90,51 +90,53 @@ def process(inputs_supplr, targets_supplr, writer, field_name):
9090
errors += 1
9191
continue
9292

93-
max_scores = [0, 0, 0]
94-
cum_scores = [0, 0, 0]
95-
best_id = None
93+
scores_max = [0, 0, 0]
94+
scores_cum = [0, 0, 0]
9695

9796
index = 0
9897
for entry in cluster:
9998
hit = entry[0]
10099
ref_features = entry[1]
101-
102100
comparisons += 1
103101
sucos_score, fm_score, vol_score = sucos.get_SucosScore(hit, mol,
104-
tani=False, ref_features=ref_features, query_features=query_features)
102+
tani=False, ref_features=ref_features,
103+
query_features=query_features)
105104

106-
if sucos_score > max_scores[0]:
107-
max_scores[0] = sucos_score
108-
max_scores[1] = fm_score
109-
max_scores[2] = vol_score
105+
if sucos_score > scores_max[0]:
106+
scores_max[0] = sucos_score
107+
scores_max[1] = fm_score
108+
scores_max[2] = vol_score
110109
cluster_index = index
111110
best_id = mol_ids[index]
112111

113-
cum_scores[0] += sucos_score
114-
cum_scores[1] += fm_score
115-
cum_scores[2] += vol_score
112+
scores_cum[0] += sucos_score
113+
scores_cum[1] += fm_score
114+
scores_cum[2] += vol_score
116115

117116
index += 1
118117

119-
if max_scores[0] > 0:
118+
# utils.log("Max SuCOS:", scores[0], "FM:", scores[1], "P:", scores[2],"File:", cluster_file_name_only, "Index:", cluster_index)
119+
mol.SetDoubleProp(field_SuCOSMax_Score, scores_max[0] if scores_max[0] > 0 else 0)
120+
mol.SetDoubleProp(field_SuCOSMax_FMScore, scores_max[1] if scores_max[1] > 0 else 0)
121+
mol.SetDoubleProp(field_SuCOSMax_ProtrudeScore, scores_max[2] if scores_max[2] > 0 else 0)
120122

121-
# cluster_file_name_only = cluster_name.split(os.sep)[-1]
122-
#utils.log("Max SuCOS:", scores[0], "FM:", scores[1], "P:", scores[2],"File:", cluster_file_name_only, "Index:", cluster_index)
123-
mol.SetDoubleProp(field_SuCOSMax_Score, max_scores[0])
124-
mol.SetDoubleProp(field_SuCOSMax_FMScore, max_scores[1])
125-
mol.SetDoubleProp(field_SuCOSMax_ProtrudeScore, max_scores[2])
123+
if best_id:
124+
mol.SetProp(field_SuCOSMax_Target, best_id)
126125
mol.SetIntProp(field_SuCOSMax_Index, cluster_index)
127-
if best_id:
128-
mol.SetProp(field_SuCOSMax_Target, best_id)
129126

130-
if cum_scores[0] > 0:
131-
#utils.log("Cum SuCOS:", scores[0], "FM:", scores[1], "P:", scores[2])
132-
mol.SetDoubleProp(field_SuCOSCum_Score, cum_scores[0])
133-
mol.SetDoubleProp(field_SuCOSCum_FMScore, cum_scores[1])
134-
mol.SetDoubleProp(field_SuCOSCum_ProtrudeScore, cum_scores[2])
127+
# utils.log("Cum SuCOS:", scores[0], "FM:", scores[1], "P:", scores[2])
128+
mol.SetDoubleProp(field_SuCOSCum_Score, scores_cum[0] if scores_cum[0] > 0 else 0)
129+
mol.SetDoubleProp(field_SuCOSCum_FMScore, scores_cum[1] if scores_cum[1] > 0 else 0)
130+
mol.SetDoubleProp(field_SuCOSCum_ProtrudeScore, scores_cum[2] if scores_cum[2] > 0 else 0)
135131

136-
writer.write(mol)
137132

133+
if filter_value and filter_field:
134+
if mol.HasProp(filter_field):
135+
val = mol.GetDoubleProp(filter_field)
136+
if val > filter_value:
137+
writer.write(mol)
138+
else:
139+
writer.write(mol)
138140

139141
utils.log("Completed", comparisons, "comparisons")
140142
return mol_num, comparisons, errors
@@ -148,12 +150,15 @@ def main():
148150
parser.add_argument('-tm', '--target-molecules', help='Target molecules to compare against')
149151
parser.add_argument('-tf', '--targets-format', help='Target molecules format')
150152
parser.add_argument('-n', '--name-field', help='Name of field with molecule name')
153+
parser.add_argument('--no-gzip', action='store_true', help='Do not compress the output (STDOUT is never compressed')
154+
parser.add_argument('--filter-value', type=float, help='Filter out values with scores less than this.')
155+
parser.add_argument('--filter-field', help='Field to use to filter values.')
151156

152157
args = parser.parse_args()
153158
utils.log("Max SuCOSMax Args: ", args)
154159

155160
source = "sucos_max.py"
156-
datasetMetaProps = {"source":source, "description": "SuCOSMax using RDKit " + rdBase.rdkitVersion}
161+
datasetMetaProps = {"source": source, "description": "SuCOSMax using RDKit " + rdBase.rdkitVersion}
157162
clsMappings = {}
158163
fieldMetaProps = []
159164

@@ -165,29 +170,37 @@ def main():
165170
clsMappings[field_SuCOSCum_FMScore] = "java.lang.Float"
166171
clsMappings[field_SuCOSCum_ProtrudeScore] = "java.lang.Float"
167172

168-
fieldMetaProps.append({"fieldName":field_SuCOSMax_Score, "values": {"source":source, "description":"SuCOS Max score"}})
169-
fieldMetaProps.append({"fieldName":field_SuCOSMax_FMScore, "values": {"source":source, "description":"SuCOS Max Feature Map score"}})
170-
fieldMetaProps.append({"fieldName":field_SuCOSMax_ProtrudeScore, "values": {"source":source, "description":"SuCOS Max Protrude score"}})
171-
fieldMetaProps.append({"fieldName":field_SuCOSMax_Index, "values": {"source":source, "description":"SuCOS Max target index"}})
172-
fieldMetaProps.append({"fieldName":field_SuCOSCum_Score, "values": {"source":source, "description":"SuCOS Cumulative score"}})
173-
fieldMetaProps.append({"fieldName":field_SuCOSCum_FMScore, "values": {"source":source, "description":"SuCOS Cumulative Feature Map score"}})
174-
fieldMetaProps.append({"fieldName":field_SuCOSCum_ProtrudeScore, "values": {"source":source, "description":"SuCOS Cumulative Protrude score"}})
173+
fieldMetaProps.append(
174+
{"fieldName": field_SuCOSMax_Score, "values": {"source": source, "description": "SuCOS Max score"}})
175+
fieldMetaProps.append({"fieldName": field_SuCOSMax_FMScore,
176+
"values": {"source": source, "description": "SuCOS Max Feature Map score"}})
177+
fieldMetaProps.append({"fieldName": field_SuCOSMax_ProtrudeScore,
178+
"values": {"source": source, "description": "SuCOS Max Protrude score"}})
179+
fieldMetaProps.append(
180+
{"fieldName": field_SuCOSMax_Index, "values": {"source": source, "description": "SuCOS Max target index"}})
181+
fieldMetaProps.append(
182+
{"fieldName": field_SuCOSCum_Score, "values": {"source": source, "description": "SuCOS Cumulative score"}})
183+
fieldMetaProps.append({"fieldName": field_SuCOSCum_FMScore,
184+
"values": {"source": source, "description": "SuCOS Cumulative Feature Map score"}})
185+
fieldMetaProps.append({"fieldName": field_SuCOSCum_ProtrudeScore,
186+
"values": {"source": source, "description": "SuCOS Cumulative Protrude score"}})
175187

176188
if args.name_field:
177189
clsMappings[field_SuCOSMax_Target] = "java.lang.String"
178-
fieldMetaProps.append({"fieldName":field_SuCOSMax_Target, "values": {"source":source, "description":"SuCOS Max target name"}})
179-
190+
fieldMetaProps.append(
191+
{"fieldName": field_SuCOSMax_Target, "values": {"source": source, "description": "SuCOS Max target name"}})
180192

181-
inputs_file,output,inputs_supplr,writer,output_base = rdkit_utils. \
182-
default_open_input_output(args.input, args.informat, args.output,
183-
'sucos-max', args.outformat,
184-
valueClassMappings=clsMappings,
185-
datasetMetaProps=datasetMetaProps,
186-
fieldMetaProps=fieldMetaProps)
193+
inputs_file, inputs_supplr = rdkit_utils.default_open_input(args.input, args.informat)
194+
output, writer, output_base = rdkit_utils.default_open_output(args.output,
195+
'sucos-max', args.outformat,
196+
valueClassMappings=clsMappings,
197+
datasetMetaProps=datasetMetaProps,
198+
fieldMetaProps=fieldMetaProps,
199+
compress=not args.no_gzip)
187200

188201
targets_file, targets_supplr = rdkit_utils.default_open_input(args.target_molecules, args.targets_format)
189202

190-
count, total, errors = process(inputs_supplr, targets_supplr, writer, args.name_field)
203+
count, total, errors = process(inputs_supplr, targets_supplr, writer, args.name_field, args.filter_value, args.filter_field)
191204

192205
inputs_file.close()
193206
targets_file.close()
@@ -196,8 +209,9 @@ def main():
196209
output.close()
197210

198211
if args.meta:
199-
utils.write_metrics(output_base, {'__InputCount__':count, '__OutputCount__':total, '__ErrorCount__':errors, 'RDKitSuCOS':total})
212+
utils.write_metrics(output_base, {'__InputCount__': count, '__OutputCount__': total, '__ErrorCount__': errors,
213+
'RDKitSuCOS': total})
200214

201215

202216
if __name__ == "__main__":
203-
main()
217+
main()

0 commit comments

Comments
 (0)