rdkit standardize dsd

tdudgeon · tdudgeon · commit 1272a236dc0b · 2019-05-19T09:47:11.000+01:00
diff --git a/src/python/pipelines/rdkit/sanifier.py b/src/python/pipelines/rdkit/sanifier.py
@@ -49,7 +49,7 @@ def main():
 
     ### command line args defintions #########################################
 
-    parser = argparse.ArgumentParser(description='RDKit molecule standardiser / enumerator')
+    parser = argparse.ArgumentParser(description='RDKit molecule standardizer / enumerator')
     parameter_utils.add_default_io_args(parser)
     parser.add_argument('-et', '--enumerate_tauts', action='store_true', help='Enumerate all tautomers')
     parser.add_argument('-es', '--enumerate_stereo', action='store_true', help='Enumerate all stereoisomers')
@@ -62,10 +62,10 @@ def main():
     utils.log("Sanifier Args: ", args)
 
     if args.standardize and args.enumerate_tauts:
-        raise ValueError("Cannot Enumerate Tautomers and Standardise")
+        raise ValueError("Cannot Enumerate Tautomers and Standardize")
 
     if args.standardize and args.enumerate_stereo:
-        raise ValueError("Cannot Enumerate Stereo and Standardise")
+        raise ValueError("Cannot Enumerate Stereo and Standardize")
 
     if args.outformat == 'sdf' and args.mol_format == 'smiles':
         raise ValueError("Smiles cannot be used when outputting as SDF")
@@ -92,14 +92,14 @@ def main():
                     std.SetProp("uuid", oldUUID)
                 #utils.log("Standardized", i, inputCanSmiles, ">>", outputCanSmiles)
                 if inputCanSmiles == outputCanSmiles:
-                    std.SetProp("Standardised", "False")
+                    std.SetProp("Standardized", "False")
                 else:
-                    std.SetProp("Standardised", "True")
+                    std.SetProp("Standardized", "True")
             except:
                 errors += 1
                 utils.log("Error standardizing", sys.exc_info()[0])
                 std = mol
-                std.SetProp("Standardised", "Error")
+                std.SetProp("Standardized", "Error")
 
             count = write_out([std],count,writer,args.mol_format,args.outformat)
         else:
diff --git a/src/python/pipelines/rdkit/standardize.dsd.yml b/src/python/pipelines/rdkit/standardize.dsd.yml
@@ -0,0 +1,64 @@
+---
+"@class": org.squonk.core.DockerServiceDescriptor
+serviceConfig:
+  id: pipelines.rdkit.standardizer.v1
+  name: RDKitStandardizer
+  description: Standardize molecules
+  tags:
+  - rdkit
+  - docker
+  - standardise  # both spellings are listed as tags (presumably so either one matches a search)
+  - standardize
+  resourceUrl:  # intentionally left empty
+  icon: icons/molecule_generator.png
+  inputDescriptors:
+  - primaryType: org.squonk.dataset.Dataset
+    secondaryType: org.squonk.types.MoleculeObject
+    mediaType: application/x-squonk-dataset-molecule+json
+    name: input
+  outputDescriptors:
+  - primaryType: org.squonk.dataset.Dataset
+    secondaryType: org.squonk.types.MoleculeObject
+    mediaType: application/x-squonk-dataset-molecule+json
+    name: output
+  optionDescriptors:  # each option's key reappears in 'command' below without its 'arg.' prefix
+  - modes:
+    - User
+    editable: true
+    "@class": org.squonk.options.OptionDescriptor
+    typeDescriptor:
+      type: java.lang.String
+      "@class": org.squonk.options.SimpleTypeDescriptor
+    key: arg.fragment_method  # used as $fragment_method in 'command'
+    label: Fragment method
+    description: Approach to use for picking biggest molecular fragment
+    values:  # same two choices that standardize.py accepts for --fragment-method
+    - hac
+    - mw
+    defaultValue: hac
+    visible: true
+  - modes:
+    - User
+    editable: true
+    "@class": org.squonk.options.OptionDescriptor
+    typeDescriptor:
+      type: java.lang.Boolean
+      "@class": org.squonk.options.SimpleTypeDescriptor
+    key: arg.neutralize  # used as the 'neutralize' condition in 'command'; adds --neutralize when true
+    label: Neutralize molecules
+    description: Convert charged groups to neutral form where possible
+    defaultValue: true
+    visible: true
+  executorClassName: org.squonk.execution.steps.impl.ThinDatasetDockerExecutorStep
+thinDescriptors:
+- input: input  # presumably names the 'input' descriptor declared above - confirm
+inputRoutes:
+- route: FILE
+outputRoutes:
+- route: FILE
+imageName: informaticsmatters/rdkit_pipelines
+command: >-
+  python -m pipelines.rdkit.standardize -i ${PIN}input.data.gz -if json -o ${POUT}output -of json
+  --fragment-method $fragment_method
+  ${neutralize ? '--neutralize' : ''}
+  --meta
diff --git a/src/python/pipelines/rdkit/standardize.py b/src/python/pipelines/rdkit/standardize.py
@@ -29,11 +29,11 @@
 uncharger = rdMolStandardize.Uncharger()
 
 
-def standardize(mol, neutralise, fragment):
+def standardize(mol, neutralize, fragment):
     """
 
     :param mol: The molecule to standardize
-    :param neutralise: Boolean for whether to neutralise the molecule
+    :param neutralize: Boolean for whether to neutralize the molecule
     :param fragment: The approach for choosing the largest fragment. Either 'hac' or 'mw'. If not specified the whole
     molecule is used.
     :return: The standardized molecule
@@ -43,7 +43,7 @@ def standardize(mol, neutralise, fragment):
     # We use our own largest fragment picker as the RDKit one behaves slightly differently
     if fragment:
         mol = mol_utils.fragment(mol, fragment)
-    if neutralise:
+    if neutralize:
         mol = uncharger.uncharge(mol)
     return mol
 
@@ -56,7 +56,7 @@ def main():
 
     parser = argparse.ArgumentParser(description='RDKit Standardize')
     parser.add_argument('--fragment-method', choices=['hac', 'mw'], help='Approach to find biggest fragment if more than one (hac = biggest by heavy atom count, mw = biggest by mol weight)')
-    parser.add_argument('--neutralise', action='store_true', help='Neutralise the molecule')
+    parser.add_argument('--neutralize', action='store_true', help='Neutralize the molecule')
 
     parameter_utils.add_default_io_args(parser)
     parser.add_argument('-q', '--quiet', action='store_true', help='Quiet mode')
@@ -78,13 +78,15 @@ def main():
                                   thinOutput=False, valueClassMappings=clsMappings,
                                   datasetMetaProps=datasetMetaProps,
                                   fieldMetaProps=fieldMetaProps)
-    i = 0
+    count = 0
     total = 0
+    errors = 0
     for mol in suppl:
+        count += 1
         if mol is None:
-            i += 1
+            errors += 1
             continue
-        m = standardize(mol, args.neutralise, args.fragment_method)
+        m = standardize(mol, args.neutralize, args.fragment_method)
         writer.write(m)
         total += 1
 
@@ -94,7 +96,7 @@ def main():
     output.close()
 
     if args.meta:
-        utils.write_metrics(output_base, {'__InputCount__':i, '__OutputCount__':total, 'RDKitStandardize':i})
+        utils.write_metrics(output_base, {'__InputCount__':count, '__OutputCount__':total, '__ErrorCount__':errors, 'RDKitStandardize':total})
 
 if __name__ == "__main__":
     main()
diff --git a/src/python/pipelines/rdkit/standardize.test b/src/python/pipelines/rdkit/standardize.test
@@ -0,0 +1,18 @@
+// Automated pipeline test specification for pipelines.rdkit.standardize.
+
+[
+
+    version = 1,
+
+    // A basic start-up test for the module: it runs the module with -h
+    // and expects the 'usage: standardize.py' text on stdout,
+    // which simply makes sure the module starts cleanly.
+    test_help = [
+
+        command: '''python -m pipelines.rdkit.standardize -h''',
+
+        stdout: [ 'usage: standardize.py' ]
+
+    ],
+
+]