Skip to content

Commit 080fb40

Browse files
committed
initial sample workflow
1 parent 5f5284b commit 080fb40

File tree

3 files changed

+215
-2
lines changed

3 files changed

+215
-2
lines changed

concatenator.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
import argparse
2+
import glob
3+
import shutil
4+
5+
from dm_job_utilities.dm_log import DmLog
6+
7+
8+
def find_files(files_glob):
    """Return the paths matching a glob pattern, in sorted order.

    ``glob.glob`` makes no promise about the order of its results (it depends
    on the file system), so the matches are sorted here. That makes the
    concatenation order reproducible, which matters in particular when the
    header of the "first" file is the one being retained.

    :param files_glob: glob pattern to expand, e.g. ``"abcd*/output.sdf"``
    :return: sorted list of matching paths (empty when nothing matches)
    """
    files = sorted(glob.glob(files_glob))
    DmLog.emit_event("Found {} files using {}".format(len(files), files_glob))
    return files
12+
13+
14+
def concat_binary(files_glob, output):
    """Concatenate the raw bytes of every file matching a glob into one file.

    The files are copied in the order returned by ``find_files``. No separator
    is written between them.

    :param files_glob: glob pattern selecting the input files
    :param output: path of the file to create (overwritten if it exists)
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    import os

    # If the output already exists and matches the glob (e.g. "*.bin" with
    # output "all.bin" on a re-run) it must not be used as an input: it is
    # truncated on open and would then be read back while being written.
    target = os.path.realpath(output)
    files = [f for f in find_files(files_glob) if os.path.realpath(f) != target]

    # A plain `with` is used; the parenthesised form needs a newer Python.
    with open(output, 'wb') as outfile:
        for file in files:
            with open(file, 'rb') as infile:
                shutil.copyfileobj(infile, outfile)

    DmLog.emit_event("Wrote {} files".format(len(files)))
24+
25+
26+
def concat_text(files_glob, header, output):
    """Concatenate text files line by line, with optional header handling.

    :param files_glob: glob pattern selecting the input files
    :param header: how to treat the first line of each input file:
        ``None`` - every line is copied (files have no header);
        ``'ignore'`` - the first line of every file is dropped;
        ``'retain'`` - the first line of the first file is kept and the
        first line of every subsequent file is dropped.
    :param output: path of the file to create (overwritten if it exists)
    :raises ValueError: if ``header`` is not one of the values above
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    import os

    if header not in (None, 'ignore', 'retain'):
        # Previously an unknown value silently produced an empty output file.
        raise ValueError("Unsupported header option: {!r}".format(header))

    # If the output already exists and matches the glob it must not be used
    # as an input: it is truncated on open and would be read while written.
    target = os.path.realpath(output)
    files = [f for f in find_files(files_glob) if os.path.realpath(f) != target]

    output_count = 0
    # Set when the last line written had no trailing newline, so that the
    # next line written (from the following file) starts on its own line
    # instead of being fused onto the previous record.
    pending_newline = False

    # A plain `with` is used; the parenthesised form needs a newer Python.
    with open(output, 'w') as outfile:
        for file_count, file in enumerate(files, start=1):
            # The header line is dropped for every file when ignoring, and
            # for every file but the first when retaining.
            skip_first = header == 'ignore' or (header == 'retain' and file_count > 1)

            with open(file) as infile:
                for line_count, line in enumerate(infile, start=1):
                    if skip_first and line_count == 1:
                        continue
                    if pending_newline:
                        outfile.write("\n")
                    outfile.write(line)
                    output_count += 1
                    pending_newline = not line.endswith("\n")

    DmLog.emit_event("Wrote {} lines from {} files".format(output_count, len(files)))
46+
47+
48+
def _build_parser():
    """Create the command line parser for the concatenator."""
    # command line args definitions #########################################
    cli = argparse.ArgumentParser(description='Concatenate files')
    cli.add_argument(
        '-f', '--files',
        required=True,
        help="Name(s) of files to look for (glob allowed)",
    )
    cli.add_argument(
        '-o', '--output',
        required=True,
        help="Name(s) of output file",
    )
    cli.add_argument(
        '--header',
        choices=["ignore", "retain"],
        help="Files have a header line, and what to do with it. If 'retain' the header of the first file is retained",
    )
    cli.add_argument(
        '-b', '--binary',
        action='store_true',
        help='Treat files as having binary content',
    )
    return cli


def main():
    """Command line entry point: parse the arguments and concatenate.

    Examples:
        python -m concatenator -f "*.sdf"
        python -m concatenator -f "abcd*/output.sdf"
        python -m concatenator -f "*.smi" --header ignore
        python -m concatenator -f "*.bin" --binary

    NOTE: if a glob is used for the files argument it must be escaped
    (e.g. abcd\\*) or quoted (e.g. "abcd*") so the shell does not expand it.
    NOTE: when --binary is given the --header argument is ignored.
    """
    args = _build_parser().parse_args()
    DmLog.emit_event("Concatenate files: ", args)

    # Binary mode copies bytes verbatim and takes no notice of --header.
    if args.binary:
        concat_binary(args.files, args.output)
        return

    concat_text(args.files, args.header, args.output)
75+
76+
77+
if __name__ == "__main__":
78+
main()

data-manager/file-utils.yaml

Lines changed: 77 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@ jobs:
1414
keywords:
1515
- sdf
1616
- splitter
17+
- file
1718
image:
1819
name: informaticsmatters/vs-rdock
1920
tag: latest
@@ -36,12 +37,11 @@ jobs:
3637
title: SDFile to split
3738
mime-types:
3839
- chemical/x-mdl-sdfile
39-
type: file
4040
outputs:
4141
type: object
4242
properties:
4343
outputFile:
44-
title: Output files
44+
title: Output file base name
4545
mime-types:
4646
- chemical/x-mdl-sdfile
4747
creates: '{{ outputFile }}_*.sdf'
@@ -74,3 +74,78 @@ jobs:
7474
- name: mychunk_00001.sdf
7575
checks:
7676
- exists: true
77+
78+
concatenator:
79+
name: Concatenate files
80+
description: >-
81+
Takes a number of input files and concatenates them into a single output file
82+
version: '1.0.0'
83+
category: file utils
84+
keywords:
85+
- concatenate
86+
- combine
87+
- file
88+
image:
89+
name: informaticsmatters/vs-prep
90+
tag: latest
91+
project-directory: /data
92+
working-directory: /data
93+
fix-permissions: true
94+
command: >-
95+
python concatenator.py -f '{{ filesGlob }}' -o '{{ outputFile }}'{% if header %} --header {{ header }}{% endif %}{% if binary %} --binary{% endif %}
96+
variables:
97+
order:
98+
options:
99+
- filesGlob
100+
- outputFile
101+
- header
102+
- binary
103+
outputs:
104+
type: object
105+
properties:
106+
outputFile:
107+
title: Output files
108+
creates: '{{ outputFile }}'
109+
type: files
110+
options:
111+
type: object
112+
required:
113+
- filesGlob
114+
- outputFile
115+
properties:
116+
filesGlob:
117+
title: Input files
118+
type: string
119+
pattern: "^[A-Za-z0-9_/\\.\\-\\?\\*]+$"
120+
dirsGlob:
121+
title: Dirs to search
122+
type: string
123+
pattern: "^[A-Za-z0-9_/\\.\\-\\?\\*]+$"
124+
outputFile:
125+
title: Output file name
126+
type: string
127+
pattern: "^[A-Za-z0-9_/\\.\\-]+$"
128+
header:
129+
title: Handle header line
130+
type: string
131+
enum:
132+
- ignore
133+
- retain
134+
binary:
135+
title: Treat as binary
136+
type: boolean
137+
default: false
138+
139+
# tests:
140+
# simple-execution:
141+
# options:
142+
# outputFile: outfile.smi
143+
# filesGlob: 10*.smi
144+
# dirGlob: data
145+
# checks:
146+
# exitCode: 0
147+
# outputs:
148+
# - name: outfile.smi
149+
# checks:
150+
# - exists: true
151+
# - lineCount: 111110

data-manager/workflow-docking.yaml

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
---
2+
kind: DataManagerWorkflow
3+
kind-version: "2025.2"
4+
name: rdock-docking-workflow
5+
description: >-
6+
Workflow that performs docking using rDock
7+
8+
steps:
9+
- name: split
10+
description: Split an input file
11+
specification:
12+
collection: file-utils
13+
job: sdf-splitter
14+
version: "1.0.0"
15+
variables:
16+
count: 100
17+
outputFile: outputFile
18+
plumbing:
19+
- variable: inputFile
20+
from-workflow:
21+
variable: candidateMolecules
22+
- variable: inputFile
23+
from-project:
24+
25+
- name: enumerate
26+
description: Enumerate candidate molecules
27+
specification:
28+
collection: im-virtual-screening
29+
job: enumerate-candidates
30+
version: "1.0.0"
31+
variables:
32+
maxHac: 25
33+
outputFile: enumerated.sdf
34+
plumbing:
35+
- variable: inputFile
36+
from-step:
37+
name: split
38+
variable: outputFile
39+
40+
# there would be a rDock step here,
41+
# but we don't yet have this as a job (only part of Nextflow workflow)
42+
43+
- name: combine
44+
description: Combine the calculated files
45+
specification:
46+
collection: file-utils
47+
job: concatenator
48+
version: "1.0.0"
49+
variables:
50+
filesGlob: .instance-*/calculated.sdf
51+
plumbing:
52+
- variable: outputFile
53+
from-workflow:
54+
variable: resultsFile
55+
- variable: inputFile
56+
from-step:
57+
name: calculate
58+
variable: outputFile
59+
- variable: outputFile
60+
to-project:

0 commit comments

Comments
 (0)