Skip to content

Commit 1272a23

Browse files
committed
rdkit standardize dsd
1 parent e380b4f commit 1272a23

File tree

4 files changed

+98
-14
lines changed

4 files changed

+98
-14
lines changed

src/python/pipelines/rdkit/sanifier.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ def main():
4949

5050
### command line args definitions #########################################
5151

52-
parser = argparse.ArgumentParser(description='RDKit molecule standardiser / enumerator')
52+
parser = argparse.ArgumentParser(description='RDKit molecule standardizer / enumerator')
5353
parameter_utils.add_default_io_args(parser)
5454
parser.add_argument('-et', '--enumerate_tauts', action='store_true', help='Enumerate all tautomers')
5555
parser.add_argument('-es', '--enumerate_stereo', action='store_true', help='Enumerate all stereoisomers')
@@ -62,10 +62,10 @@ def main():
6262
utils.log("Sanifier Args: ", args)
6363

6464
if args.standardize and args.enumerate_tauts:
65-
raise ValueError("Cannot Enumerate Tautomers and Standardise")
65+
raise ValueError("Cannot Enumerate Tautomers and Standardize")
6666

6767
if args.standardize and args.enumerate_stereo:
68-
raise ValueError("Cannot Enumerate Stereo and Standardise")
68+
raise ValueError("Cannot Enumerate Stereo and Standardize")
6969

7070
if args.outformat == 'sdf' and args.mol_format == 'smiles':
7171
raise ValueError("Smiles cannot be used when outputting as SDF")
@@ -92,14 +92,14 @@ def main():
9292
std.SetProp("uuid", oldUUID)
9393
#utils.log("Standardized", i, inputCanSmiles, ">>", outputCanSmiles)
9494
if inputCanSmiles == outputCanSmiles:
95-
std.SetProp("Standardised", "False")
95+
std.SetProp("Standardized", "False")
9696
else:
97-
std.SetProp("Standardised", "True")
97+
std.SetProp("Standardized", "True")
9898
except:
9999
errors += 1
100100
utils.log("Error standardizing", sys.exc_info()[0])
101101
std = mol
102-
std.SetProp("Standardised", "Error")
102+
std.SetProp("Standardized", "Error")
103103

104104
count = write_out([std],count,writer,args.mol_format,args.outformat)
105105
else:
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
---
2+
"@class": org.squonk.core.DockerServiceDescriptor
3+
serviceConfig:
4+
id: pipelines.rdkit.standardizer.v1
5+
name: RDKitStandardizer
6+
description: Standardize molecules
7+
tags:
8+
- rdkit
9+
- docker
10+
- standardise
11+
- standardize
12+
resourceUrl:
13+
icon: icons/molecule_generator.png
14+
inputDescriptors:
15+
- primaryType: org.squonk.dataset.Dataset
16+
secondaryType: org.squonk.types.MoleculeObject
17+
mediaType: application/x-squonk-dataset-molecule+json
18+
name: input
19+
outputDescriptors:
20+
- primaryType: org.squonk.dataset.Dataset
21+
secondaryType: org.squonk.types.MoleculeObject
22+
mediaType: application/x-squonk-dataset-molecule+json
23+
name: output
24+
optionDescriptors:
25+
- modes:
26+
- User
27+
editable: true
28+
"@class": org.squonk.options.OptionDescriptor
29+
typeDescriptor:
30+
type: java.lang.String
31+
"@class": org.squonk.options.SimpleTypeDescriptor
32+
key: arg.fragment_method
33+
label: Fragment method
34+
description: Approach to use for picking biggest molecular fragment
35+
values:
36+
- hac
37+
- mw
38+
defaultValue: hac
39+
visible: true
40+
- modes:
41+
- User
42+
editable: true
43+
"@class": org.squonk.options.OptionDescriptor
44+
typeDescriptor:
45+
type: java.lang.Boolean
46+
"@class": org.squonk.options.SimpleTypeDescriptor
47+
key: arg.neutralize
48+
label: Neutralize molecules
49+
description: Convert charged groups to neutral form where possible
50+
defaultValue: true
51+
visible: true
52+
executorClassName: org.squonk.execution.steps.impl.ThinDatasetDockerExecutorStep
53+
thinDescriptors:
54+
- input: input
55+
inputRoutes:
56+
- route: FILE
57+
outputRoutes:
58+
- route: FILE
59+
imageName: informaticsmatters/rdkit_pipelines
60+
command: >-
61+
python -m pipelines.rdkit.standardize -i ${PIN}input.data.gz -if json -o ${POUT}output -of json
62+
--fragment-method $fragment_method
63+
${neutralize ? '--neutralize' : ''}
64+
--meta

src/python/pipelines/rdkit/standardize.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,11 @@
2929
uncharger = rdMolStandardize.Uncharger()
3030

3131

32-
def standardize(mol, neutralise, fragment):
32+
def standardize(mol, neutralize, fragment):
3333
"""
3434
3535
:param mol: The molecule to standardize
36-
:param neutralise: Boolean for whether to neutralise the molecule
36+
:param neutralize: Boolean for whether to neutralize the molecule
3737
:param fragment: The approach for choosing the largest fragment. Either 'hac' or 'mw'. If not specified the whole
3838
molecule is used.
3939
:return: The standardized molecule
@@ -43,7 +43,7 @@ def standardize(mol, neutralise, fragment):
4343
# We use our own largest fragment picker as the RDKit one behaves slightly differently
4444
if fragment:
4545
mol = mol_utils.fragment(mol, fragment)
46-
if neutralise:
46+
if neutralize:
4747
mol = uncharger.uncharge(mol)
4848
return mol
4949

@@ -56,7 +56,7 @@ def main():
5656

5757
parser = argparse.ArgumentParser(description='RDKit Standardize')
5858
parser.add_argument('--fragment-method', choices=['hac', 'mw'], help='Approach to find biggest fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight)')
59-
parser.add_argument('--neutralise', action='store_true', help='Neutralise the molecule')
59+
parser.add_argument('--neutralize', action='store_true', help='Neutralize the molecule')
6060

6161
parameter_utils.add_default_io_args(parser)
6262
parser.add_argument('-q', '--quiet', action='store_true', help='Quiet mode')
@@ -78,13 +78,15 @@ def main():
7878
thinOutput=False, valueClassMappings=clsMappings,
7979
datasetMetaProps=datasetMetaProps,
8080
fieldMetaProps=fieldMetaProps)
81-
i = 0
81+
count = 0
8282
total = 0
83+
errors = 0
8384
for mol in suppl:
85+
count += 1
8486
if mol is None:
85-
i += 1
87+
errors += 1
8688
continue
87-
m = standardize(mol, args.neutralise, args.fragment_method)
89+
m = standardize(mol, args.neutralize, args.fragment_method)
8890
writer.write(m)
8991
total += 1
9092

@@ -94,7 +96,7 @@ def main():
9496
output.close()
9597

9698
if args.meta:
97-
utils.write_metrics(output_base, {'__InputCount__':i, '__OutputCount__':total, 'RDKitStandardize':i})
99+
utils.write_metrics(output_base, {'__InputCount__':count, '__OutputCount__':total, '__ErrorCount__':errors, 'RDKitStandardize':total})
98100

99101
if __name__ == "__main__":
100102
main()
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
// Automated pipeline test specification.
2+
3+
[
4+
5+
version = 1,
6+
7+
// A basic start-up test for the module.
8+
// Simply makes sure it starts cleanly.
9+
//
10+
test_help = [
11+
12+
command: '''python -m pipelines.rdkit.standardize -h''',
13+
14+
stdout: [ 'usage: standardize.py' ]
15+
16+
],
17+
18+
]

0 commit comments

Comments
 (0)