initial sample workflow

tdudgeon · tdudgeon · commit 080fb404518d · 2025-09-11T15:15:34.000+01:00
diff --git a/concatenator.py b/concatenator.py
@@ -0,0 +1,78 @@
+import argparse
+import glob
+import os
+import shutil
+
+from dm_job_utilities.dm_log import DmLog
+
+
+def find_files(files_glob):
+    files = glob.glob(files_glob)
+    DmLog.emit_event("Found {} files using {}".format(len(files), files_glob))
+    return files
+
+
+def concat_binary(files_glob, output):
+    files = find_files(files_glob)
+    with (open(output, 'wb') as outfile):
+        file_count = 0
+        for file in files:
+            file_count += 1
+            with open(file,'rb') as infile:
+                shutil.copyfileobj(infile, outfile)
+
+        DmLog.emit_event("Wrote {} files".format(file_count))
+
+
+def concat_text(files_glob, header, output):
+    files = find_files(files_glob)
+    output_count = 0
+    with (open(output, 'w') as outfile):
+        file_count = 0
+        for file in files:
+            file_count += 1
+
+            with open(file) as infile:
+                line_count = 0
+                for line in infile:
+                    line_count += 1
+                    if header is None \
+                            or (header == 'ignore' and line_count > 1) \
+                            or (header == 'retain' and line_count == 1 and file_count == 1) \
+                            or (header == 'retain' and line_count > 1):
+                        outfile.write(line)
+                        output_count += 1
+
+    DmLog.emit_event("Wrote {} lines from {} files".format(output_count, file_count))
+
+
+def main():
+
+    # Examples:
+    #   python -m concatenator -f "*.sdf"
+    #   python -m concatenator -f "abcd*/output.sdf"
+    #   python -m concatenator -f "*.smi" --header ignore
+    #   python -m concatenator -f "*.bin" --binary
+    #
+    # NOTE: that if using globs for the files argument this must be escaped (e.g. abcd\*) or put in
+    # quotes (e.g. "abcd*") so that they are not expanded by the shell.
+    # NOTE: when using the --binary argument the --header argument is ignored.
+
+    # command line args definitions #########################################
+    parser = argparse.ArgumentParser(description='Concatenate files')
+    parser.add_argument('-f', '--files', required=True, help="Name(s) of files to look for (glob allowed)")
+    parser.add_argument('-o', '--output', required=True, help="Name(s) of output file")
+    parser.add_argument('--header', choices=["ignore", "retain"],
+                        help="Files have a header line, and what to do with it. If 'retain' the header of the first file is retained")
+    parser.add_argument('-b', '--binary', action='store_true', help='Treat files as having binary content')
+
+    args = parser.parse_args()
+    DmLog.emit_event("Concatenate files: ", args)
+
+    if args.binary:
+        concat_binary(args.files, args.output)
+    else:
+        concat_text(args.files, args.header, args.output)
+
+
+# Run the command line interface only when executed as a script, not when imported.
+if __name__ == "__main__":
+    main()
diff --git a/data-manager/file-utils.yaml b/data-manager/file-utils.yaml
@@ -14,6 +14,7 @@ jobs:
     keywords:
     - sdf
     - splitter
+    - file
     image:
       name: informaticsmatters/vs-rdock
       tag: latest
@@ -36,12 +37,11 @@ jobs:
             title: SDFile to split
             mime-types:
             - chemical/x-mdl-sdfile
-            type: file
       outputs:
         type: object
         properties:
           outputFile:
-            title: Output files
+            title: Output file base name
             mime-types:
             - chemical/x-mdl-sdfile
             creates: '{{ outputFile }}_*.sdf'
@@ -74,3 +74,78 @@ jobs:
           - name: mychunk_00001.sdf
             checks:
             - exists: true
+
+  # Job that runs concatenator.py to merge the files matching a glob into one file.
+  concatenator:
+    name: Concatenate files
+    description: >-
+      Takes a number of input files and concatenates them into a single output file
+    version: '1.0.0'
+    category: file utils
+    keywords:
+      - concatenate
+      - combine
+      - file
+    image:
+      name: informaticsmatters/vs-prep
+      tag: latest
+      project-directory: /data
+      working-directory: /data
+      fix-permissions: true
+    # The arguments must match concatenator.py's argparse definition:
+    # -f/--files and -o/--output are required, --header and --binary are optional.
+    # The script has no -d option and does not accept a positional output name.
+    command: >-
+      python concatenator.py -f '{{ filesGlob }}' -o '{{ outputFile }}'
+      {% if header %}--header {{ header }}{% endif %}
+      {% if binary %}--binary{% endif %}
+    variables:
+      order:
+        options:
+          - filesGlob
+          - outputFile
+          - header
+          - binary
+      outputs:
+        type: object
+        properties:
+          outputFile:
+            title: Output files
+            creates: '{{ outputFile }}'
+            type: files
+      options:
+        type: object
+        required:
+          - filesGlob
+          - outputFile
+        properties:
+          filesGlob:
+            title: Input files
+            type: string
+            pattern: "^[A-Za-z0-9_/\\.\\-\\?\\*]+$"
+          # NOTE(review): dirsGlob is not passed to the command because
+          # concatenator.py has no directory argument; directories can be made
+          # part of filesGlob instead (e.g. "abcd*/output.sdf"). Confirm whether
+          # this option should be implemented in the script or dropped.
+          dirsGlob:
+            title: Dirs to search
+            type: string
+            pattern: "^[A-Za-z0-9_/\\.\\-\\?\\*]+$"
+          outputFile:
+            title: Output file name
+            type: string
+            pattern: "^[A-Za-z0-9_/\\.\\-]+$"
+          header:
+            title: Handle header line
+            type: string
+            enum:
+              - ignore
+              - retain
+          binary:
+            title: Treat as binary
+            type: boolean
+            default: false
+
+#    tests:
+#      simple-execution:
+#        options:
+#          outputFile: outfile.smi
+#          filesGlob: 10*.smi
+#          dirGlob: data
+#        checks:
+#          exitCode: 0
+#          outputs:
+#            - name: outfile.smi
+#              checks:
+#                - exists: true
+#                - lineCount: 111110
diff --git a/data-manager/workflow-docking.yaml b/data-manager/workflow-docking.yaml
@@ -0,0 +1,60 @@
+---
+kind: DataManagerWorkflow
+kind-version: "2025.2"
+name: rdock-docking-workflow
+description: >-
+  Workflow that performs docking using rDock
+
+steps:
+  # Step 1: run the file-utils sdf-splitter job on the workflow's
+  # candidateMolecules input. outputFile is the base name of the generated files.
+  - name: split
+    description: Split an input file
+    specification:
+      collection: file-utils
+      job: sdf-splitter
+      version: "1.0.0"
+      variables:
+        count: 100
+        outputFile: outputFile
+    plumbing:
+      - variable: inputFile
+        from-workflow:
+          variable: candidateMolecules
+      # NOTE(review): inputFile also has an empty from-project entry, presumably
+      # marking the file as taken from the project - verify against the workflow schema.
+      - variable: inputFile
+        from-project:
+
+  # Step 2: run the im-virtual-screening enumerate-candidates job, taking its
+  # inputFile from the outputFile variable of the split step.
+  - name: enumerate
+    description: Enumerate candidate molecules
+    specification:
+      collection: im-virtual-screening
+      job: enumerate-candidates
+      version: "1.0.0"
+      variables:
+        maxHac: 25
+        outputFile: enumerated.sdf
+    plumbing:
+      - variable: inputFile
+        from-step:
+          name: split
+          variable: outputFile
+
+  # there would be a rDock step here,
+  # but we don't yet have this as a job (only part of Nextflow workflow)
+
+  # Final step: run the file-utils concatenator job over the files matching
+  # filesGlob and write the result to the workflow's resultsFile.
+  - name: combine
+    description: Combine the calculated files
+    specification:
+      collection: file-utils
+      job: concatenator
+      version: "1.0.0"
+      variables:
+        filesGlob: .instance-*/calculated.sdf
+    plumbing:
+      - variable: outputFile
+        from-workflow:
+          variable: resultsFile
+      # NOTE(review): no step named "calculate" is defined in this workflow (the
+      # steps are split, enumerate and combine) - presumably a placeholder for the
+      # missing rDock step. Also, inputFile is not one of the options declared by
+      # the concatenator job (filesGlob, dirsGlob, outputFile, header, binary).
+      - variable: inputFile
+        from-step:
+          name: calculate
+          variable: outputFile
+      - variable: outputFile
+        to-project: