Skip to content

Commit 915dcef

Browse files
committed
further improvements to charge and taut/stereo enumerators
1 parent ee89e17 commit 915dcef

File tree

4 files changed

+53
-20
lines changed

4 files changed

+53
-20
lines changed

src/python/pipelines/dimorphite/enumerate_charges.dsd.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -66,9 +66,9 @@ serviceConfig:
6666
- mw
6767
defaultValue: hac
6868
visible: true
69-
executorClassName: org.squonk.execution.steps.impl.ThinDatasetDockerExecutorStep
70-
thinDescriptors:
71-
- input: input
69+
executorClassName: org.squonk.execution.steps.impl.DefaultDockerExecutorStep
70+
#thinDescriptors:
71+
#- input: input
7272
inputRoutes:
7373
- route: FILE
7474
outputRoutes:

src/python/pipelines/dimorphite/enumerate_charges.py

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -47,9 +47,9 @@ def writeEnumeratedMols(src_mol, enum_mols, writer, index):
4747

4848
def add_src_mol_ref(src_mol, target_mol, index):
4949
"""
50-
Add the ID of the source molecule to the enumerated molecule as the field named EnumChargeSrcMol.
51-
The ID is taken form the uuid field if it exists, if not form the _Name field if it exists and finally
52-
from the index parameter (the index of the source molecule in the input) if neither of those fields are found.
50+
Add the ID of the source molecule to the enumerated molecule as the field named EnumChargesSrcMolUUID.
51+
The ID is taken from the uuid field if it exists, if not from the _Name field if it exists.
52+
The EnumChargesSrcMolIdx field is always set with the index of the source molecule in the input.
5353
:param src_mol:
5454
:param target_mol:
5555
:param index:
@@ -59,11 +59,11 @@ def add_src_mol_ref(src_mol, target_mol, index):
5959
parent = src_mol.GetProp('uuid')
6060
elif src_mol.HasProp('_name_'):
6161
parent = src_mol.GetProp('_Name')
62-
else:
63-
parent = str(index)
6462

6563
if parent:
66-
target_mol.SetProp('EnumChargeSrcMol', parent)
64+
target_mol.SetProp('EnumChargesSrcMolUUID', parent)
65+
66+
target_mol.SetIntProp('EnumChargesSrcMolIdx', index)
6767

6868
### start main execution #########################################
6969

@@ -88,9 +88,12 @@ def main():
8888
source = "enumerate_charges.py"
8989
datasetMetaProps = {"source":source, "description": "Enumerate charges using Dimorphite-dl"}
9090
clsMappings = {
91-
"EnumChargeSrcMol": "java.lang.String"}
91+
"EnumChargesSrcMolUUID": "java.lang.String",
92+
"EnumChargesSrcMolIdx": "java.lang.Integer"
93+
}
9294
fieldMetaProps = [
93-
{"fieldName":"EnumChargeSrcMol", "values": {"source":source, "description":"ID of source molecule"}}
95+
{"fieldName":"EnumChargesSrcMolUUID", "values": {"source":source, "description":"UUID of source molecule"}},
96+
{"fieldName":"EnumChargesSrcMolIdx", "values": {"source":source, "description":"Index of source molecule"}}
9497
]
9598

9699
oformat = utils.determine_output_format(args.outformat)

src/python/pipelines/rdkit/sanifier.py

Lines changed: 38 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/usr/bin/env python
22

3-
# Copyright 2017 Informatics Matters Ltd.
3+
# Copyright 2019 Informatics Matters Ltd.
44
#
55
# Licensed under the Apache License, Version 2.0 (the "License");
66
# you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
1414
# See the License for the specific language governing permissions and
1515
# limitations under the License.
1616

17-
### Use MolVS to do tautomer enumeration, sterochemistry enumeration, charge neutralisation.
17+
### Use MolVS to do tautomer enumeration, stereochemistry enumeration, charge neutralisation.
1818

1919
import sys, argparse
2020

@@ -73,7 +73,28 @@ def main():
7373
if args.standardize:
7474
getStandardMolecule = STANDARD_MOL_METHODS[args.standardize_method]
7575

76-
input ,output ,suppl ,writer ,output_base = rdkit_utils.default_open_input_output(args.input, args.informat, args.output, 'sanify', args.outformat)
76+
# handle metadata
77+
source = "sanifier.py"
78+
datasetMetaProps = {"source":source, "description": "Enumerate tautomers and stereoisomers"}
79+
clsMappings = {
80+
"EnumTautIsoSourceMolUUID": "java.lang.String",
81+
"EnumTautIsoSourceMolIdx": "java.lang.Integer"
82+
}
83+
fieldMetaProps = [
84+
{"fieldName":"EnumTautIsoSourceMolUUID", "values": {"source":source, "description":"UUID of source molecule"}},
85+
{"fieldName":"EnumTautIsoSourceMolIdx", "values": {"source":source, "description":"Index of source molecule"}}
86+
]
87+
88+
oformat = utils.determine_output_format(args.outformat)
89+
90+
input,output,suppl,writer,output_base = rdkit_utils. \
91+
default_open_input_output(args.input, args.informat, args.output,
92+
'sanifier', args.outformat,
93+
thinOutput=False, valueClassMappings=clsMappings,
94+
datasetMetaProps=datasetMetaProps,
95+
fieldMetaProps=fieldMetaProps)
96+
97+
7798
i=0
7899
count=0
79100
errors=0
@@ -110,11 +131,13 @@ def main():
110131
parentUuid = None
111132

112133
results = []
113-
results.append(mol)
134+
114135

115136
if args.enumerate_tauts:
116137
utils.log("Enumerating tautomers")
117138
results = enumerateTautomers(mol)
139+
else:
140+
results.append(mol)
118141

119142
if args.enumerate_stereo:
120143
utils.log("Enumerating steroisomers")
@@ -125,10 +148,14 @@ def main():
125148
results.extend(enumerated)
126149

127150
for m in results:
151+
# copy the src mol props
152+
for name in mol.GetPropNames():
153+
m.SetProp(name, mol.GetProp(name))
154+
# add our new props
128155
m.ClearProp("uuid")
129-
m.SetIntProp("SourceMolNum", i)
156+
m.SetIntProp("EnumTautIsoSourceMolIdx", i)
130157
if parentUuid:
131-
m.SetProp("SourceMolUUID", parentUuid)
158+
m.SetProp("EnumTautIsoSourceMolUUID", parentUuid)
132159

133160
count = write_out(results,count,writer,args.mol_format,args.outformat)
134161

@@ -139,6 +166,11 @@ def main():
139166
input.close()
140167
output.close()
141168

169+
# re-write the metadata as we now know the size
170+
if oformat == 'json':
171+
utils.write_squonk_datasetmetadata(output_base, False, clsMappings, datasetMetaProps, fieldMetaProps, size=count)
172+
173+
142174
if args.meta:
143175
utils.write_metrics(output_base, {'__InputCount__':i, '__OutputCount__':count, '__ErrorCount__':errors , 'RDKitSanify':count })
144176

src/python/pipelines/rdkit/sanifier_enumerator.dsd.yml

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -69,9 +69,7 @@ serviceConfig:
6969
minValues: 1
7070
maxValues: 1
7171
visible: true
72-
executorClassName: org.squonk.execution.steps.impl.ThinDatasetDockerExecutorStep
73-
thinDescriptors:
74-
- input: input
72+
executorClassName: org.squonk.execution.steps.impl.DefaultDockerExecutorStep
7573
inputRoutes:
7674
- route: FILE
7775
outputRoutes:

0 commit comments

Comments
 (0)