Skip to content

Commit 7577b08

Browse files
committed
sdf splitter job
1 parent e47acdc commit 7577b08

File tree

5 files changed

+128
-0
lines changed

5 files changed

+128
-0
lines changed

Dockerfile-rdock

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ RUN yum -y update &&\
1010
RUN pip install --trusted-host pypi.org --trusted-host pypi.python.org im-data-manager-job-utilities==1.2.0
1111

1212
COPY prepare_rdock.py rdock_*.sh /code/
13+
COPY scripts/split-sdf.sh /usr/local/bin/
1314
RUN chmod 755 /code/prepare_rdock.py
1415

1516
USER rdock
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# Job: split-sdf
2+
3+
This describes how to run the `sdf-splitter` job (implemented by the `split-sdf.sh` script) from the `file utils` category in the `file-utils` collection.
4+
5+
## What the job does
6+
7+
This job reads an SD-file and splits it into chunks.
8+
9+
## Implementation details
10+
11+
This job is implemented as a bash script, using the sdsplit utility from rdock.
12+
13+
* Job definition: `jobs.sdf-splitter` in [file-utils.yaml](/data-manager/file-utils.yaml)
14+
15+
## How to run the job
16+
17+
### Inputs
18+
19+
* **SDFile to split**: the file to split into chunks
20+
21+
### Options
22+
* **Output file name**: the base name of the output files. e.g. specify `foo` and you get files
23+
like `foo_00001.sdf`, `foo_00002.sdf` etc.
24+
* **Chunk size**: the number of molecules in each output file.

data-manager/file-utils.yaml

Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
---
2+
kind: DataManagerJobDefinition
3+
kind-version: '2021.1'
4+
name: File utility jobs
5+
collection: file-utils
6+
7+
jobs:
8+
sdf-splitter:
9+
name: Split SDF file
10+
description: >-
11+
Takes a SDF file and splits it into chunks of a specified number of records
12+
version: '1.0.0'
13+
category: file utils
14+
keywords:
15+
- sdf
16+
- splitter
17+
image:
18+
name: informaticsmatters/vs-rdock
19+
tag: latest
20+
project-directory: /data
21+
working-directory: /data
22+
fix-permissions: true
23+
command: >-
24+
split-sdf.sh {{ inputFile }} {{ count }} {{ outputFile }}
25+
variables:
26+
inputs:
27+
type: object
28+
required:
29+
- inputFile
30+
properties:
31+
inputFile:
32+
title: SDFile to split
33+
mime-types:
34+
- chemical/x-mdl-sdfile
35+
type: file
36+
outputs:
37+
type: object
38+
properties:
39+
outputFile:
40+
title: Output files
41+
mime-types:
42+
- chemical/x-mdl-sdfile
43+
creates: '{{ outputFile }}_*.sdf'
44+
type: files
45+
options:
46+
type: object
47+
required:
48+
- count
49+
properties:
50+
outputFile:
51+
title: Output file name
52+
type: string
53+
pattern: "^[A-Za-z0-9_/\\.\\-]+$"
54+
default: chunk
55+
count:
56+
title: Chunk size
57+
type: integer
58+
minimum: 1
59+
60+
tests:
61+
simple-execution:
62+
inputs:
63+
inputFile: data/candidates.sdf
64+
options:
65+
outputFile: mychunk
66+
count: 5
67+
checks:
68+
exitCode: 0
69+
outputs:
70+
- name: mychunk_00001.sdf
71+
checks:
72+
- exists: true
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
---
2+
kind: DataManagerManifest
3+
kind-version: '2021.1'
4+
5+
job-definition-files:
6+
- file-utils.yaml

scripts/split-sdf.sh

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#!/bin/bash
#
# Split an SD-file into fixed-size chunks using rDock's `sdsplit`.
#
# Usage: split-sdf.sh <input> <chunk-size> <output-base>
#   arg1: file to split (plain, or gzip-compressed when the name ends in .gz)
#   arg2: chunk size (number of records per output file, positive integer)
#   arg3: base filename for outputs
#
# Output files are named <output-base>_NNNNN.sdf, where NNNNN is the chunk
# number zero-padded to at least five digits (e.g. foo_00001.sdf).
#
# Intermediate files are written to the current working directory as
# mols_part_<n>.sd and are renamed to the final names afterwards.

# Abort on the first failing command (including any stage of a pipeline) and
# on use of an unset variable, so a failed sdsplit/zcat is not silently
# followed by the rename loop.
set -euo pipefail

readonly PART_PREFIX='mols_part_'
readonly PART_SUFFIX='.sd'

die() {
  printf '%s\n' "$*" >&2
  exit 1
}

if (( $# != 3 )); then
  printf 'Usage: %s <input> <chunk-size> <output-base>\n' "${0##*/}" >&2
  exit 2
fi

input_file=$1
chunk_size=$2
output_base=$3

[[ -r "$input_file" ]] || die "cannot read input file: $input_file"
[[ "$chunk_size" =~ ^[1-9][0-9]*$ ]] \
  || die "chunk size must be a positive integer: $chunk_size"

# sdsplit takes the chunk size as -<n> and the output root name as -o<root>.
if [[ "$input_file" == *.gz ]]; then
  zcat -- "$input_file" | sdsplit "-${chunk_size}" "-o${PART_PREFIX}"
else
  sdsplit "-${chunk_size}" "-o${PART_PREFIX}" "$input_file"
fi

# With nullglob an unmatched pattern expands to nothing instead of the
# literal pattern, so "no chunks produced" can be reported clearly.
shopt -s nullglob
parts=("${PART_PREFIX}"*"${PART_SUFFIX}")
(( ${#parts[@]} > 0 )) || die "sdsplit produced no output files"

# The output base may contain a directory component; make sure it exists
# before moving files into it.
if [[ "$output_base" == */* ]]; then
  mkdir -p -- "${output_base%/*}"
fi

for part in "${parts[@]}"; do
  # Extract the chunk number between the fixed prefix and suffix.
  index=${part#"$PART_PREFIX"}
  index=${index%"$PART_SUFFIX"}

  if [[ ! "$index" =~ ^[0-9]+$ ]]; then
    printf 'warn: skipping unexpected file: %s\n' "$part" >&2
    continue
  fi

  # Force base 10 so an index with leading zeros is not read as octal.
  printf -v padded '%05d' "$((10#$index))"
  mv -- "$part" "${output_base}_${padded}.sdf"
done

0 commit comments

Comments
 (0)