Skip to content

Commit f556d6c

Browse files
Merge pull request #143 from EBI-Metagenomics/bbmap_reformat_standardise
Added bbmap_reformat_standardise from amplicon pipeline
2 parents e3407de + ee99097 commit f556d6c

File tree

10 files changed

+293
-0
lines changed

10 files changed

+293
-0
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
.nextflow*
2+
.claude/
23
modules/nf-core
34
work/
45
results/
@@ -18,6 +19,7 @@ __pycache__
1819
node_modules/
1920
package.json
2021
package-lock.json
22+
CLAUDE.md
2123

2224
.venv/
2325

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
---
2+
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
3+
channels:
4+
- conda-forge
5+
- bioconda
6+
dependencies:
7+
# Conda package(s) required by this module.
8+
# Software MUST be pinned to channel (i.e. "bioconda"), version (i.e. "1.10").
9+
# For Conda, the build (i.e. "h9402c20_2") must be EXCLUDED to support installation on different operating systems.
10+
- "bioconda::bbmap=39.33"
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
BBTools Copyright (c) 2014, The Regents of the University of California, through Lawrence Berkeley National Laboratory (subject to receipt of any required approvals from the U.S. Dept. of Energy). All rights reserved.
2+
3+
4+
5+
Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
6+
7+
8+
9+
(1) Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
10+
11+
12+
13+
(2) Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
14+
15+
16+
17+
(3) Neither the name of the University of California, Lawrence Berkeley National Laboratory, U.S. Dept. of Energy nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
18+
19+
20+
21+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
22+
23+
24+
25+
You are under no obligation whatsoever to provide any bug fixes, patches, or upgrades to the features, functionality or performance of the source code ("Enhancements") to anyone; however, if you choose to make your Enhancements available either publicly, or directly to Lawrence Berkeley National Laboratory, without imposing a separate written license agreement for such Enhancements, then you hereby grant the following license: a non-exclusive, royalty-free perpetual license to install, use, modify, prepare derivative works, incorporate into other computer software, distribute, and sublicense such enhancements or derivative works thereof, in binary and source code form.
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
// Runs BBMap's reformat.sh on single-end, interleaved or two-file paired reads
// and writes the result with the requested output extension.
//
// Inputs:
//   meta    - map; this process reads meta.id, meta.single_end and meta.interleaved
//   reads   - one file (single-end or interleaved) or a list of two files (paired)
//   out_fmt - extension appended to every output read file (e.g. 'fastq.gz')
//
// Outputs:
//   reformated - files matching *_reformated.<out_fmt>
//   singleton  - <prefix>_singleton.<out_fmt> (optional; only requested for non single-end runs)
//   versions   - versions.yml
//   log        - *.log (combined stdout/stderr of reformat.sh)
process BBMAP_REFORMAT_STANDARDISE {
    tag "$meta.id"
    label 'process_single'

    conda "${moduleDir}/environment.yml"
    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/5a/5aae5977ff9de3e01ff962dc495bfa23f4304c676446b5fdf2de5c7edfa2dc4e/data' :
        'community.wave.seqera.io/library/bbmap_pigz:07416fe99b090fa9' }"

    input:
    tuple val(meta), path(reads)
    val(out_fmt)

    output:
    tuple val(meta), path("*_reformated.${out_fmt}")        , emit: reformated
    tuple val(meta), path("${prefix}_singleton.${out_fmt}"), optional: true, emit: singleton
    path "versions.yml"                                     , emit: versions
    path "*.log"                                            , emit: log

    when:
    task.ext.when == null || task.ext.when

    script:
    def args = task.ext.args ?: ''
    // `prefix` is deliberately NOT declared with `def`: the output block interpolates it.
    prefix = task.ext.prefix ?: "${meta.id}"

    // Normalise to a list so indexing behaves the same for a bare path and for a collection.
    def read_files = (reads instanceof Collection) ? reads : [reads]
    def in_reads = read_files.size() == 1
        ? "in=${read_files[0]}"
        : "in=${read_files[0]} in2=${read_files[1]}"

    // Output layout is driven by meta.single_end (not by the number of input files),
    // so an interleaved single file can still be split into _1/_2 outputs.
    def out_reads = meta.single_end
        ? "out=${prefix}_reformated.${out_fmt}"
        : "out=${prefix}_1_reformated.${out_fmt} out2=${prefix}_2_reformated.${out_fmt} outs=${prefix}_singleton.${out_fmt}"
    def interleaved_cmd = meta.interleaved ? "int=t verifyinterleaved=t" : ""
    def paired_cmd = meta.single_end ? "" : "addslash=t spaceslash=t verifypaired=t"

    // Only cap the JVM heap when a memory directive is set. Megabytes are used so that
    // allocations below 1 GB do not round down to -Xmx0G.
    def heap_arg = task.memory ? "-Xmx${task.memory.toMega()}M" : ''

    """
    # reformat.sh is from BBMap (https://sourceforge.net/projects/bbmap/)
    reformat.sh \\
        $heap_arg \\
        $in_reads \\
        $out_reads \\
        $interleaved_cmd \\
        $paired_cmd \\
        threads=${task.cpus} \\
        ${args} \\
        &> ${prefix}.reformat.sh.log

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        bbmap: \$(bbversion.sh | grep -v "Duplicate cpuset")
    END_VERSIONS
    """

    stub:
    prefix = task.ext.prefix ?: "${meta.id}"
    // Mirror the file names the real script would request for this layout.
    def stub_outputs = meta.single_end
        ? ["${prefix}_reformated.${out_fmt}"]
        : ["${prefix}_1_reformated.${out_fmt}", "${prefix}_2_reformated.${out_fmt}", "${prefix}_singleton.${out_fmt}"]
    """
    for stub_file in ${stub_outputs.join(' ')}; do
        echo "" | gzip > "\$stub_file"
    done
    touch ${prefix}.reformat.sh.log

    cat <<-END_VERSIONS > versions.yml
    "${task.process}":
        bbmap: \$(bbversion.sh | grep -v "Duplicate cpuset")
    END_VERSIONS
    """
}
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json
2+
name: "bbmap_reformat_standardise"
3+
description: "De-interleave interleaved paired-end reads and standardize FASTQ format using BBMap's reformat.sh tool."
4+
keywords:
5+
- reformat
6+
- interleave
7+
- paired-end
8+
- fastq
9+
- standardise
10+
tools:
11+
- "bbmap":
12+
description: "BBMap is a fast and memory-efficient aligner and toolkit for various bioinformatics operations, including read reformatting and validation."
13+
homepage: "https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide"
14+
documentation: "https://jgi.doe.gov/data-and-tools/bbtools/bb-tools-user-guide"
15+
tool_dev_url: "https://sourceforge.net/projects/bbmap"
16+
doi: ""
17+
licence: ["see licence.txt"]
18+
identifier: null
19+
20+
input:
21+
- meta:
22+
type: map
23+
description: Metadata dictionary containing sample identifier, single_end flag, and interleaved flag
24+
pattern: "id|single_end|interleaved"
25+
- reads:
26+
type: file
27+
description: "Input FASTQ file(s). Can be single-end or paired-end (as list). Paired-end reads can be interleaved (single file) or de-interleaved (two files)."
28+
pattern: "*.{fastq,fq,fastq.gz,fq.gz}"
29+
- out_fmt:
30+
type: string
31+
description: "Output file format (e.g., 'fastq.gz', 'fasta.gz')"
32+
pattern: "^\\w+(\\.\\w+)*$"
33+
34+
output:
35+
- meta:
36+
type: map
37+
description: Metadata passed through from input
38+
- reformated:
39+
type: file
40+
description: "De-interleaved and standardized reads. For paired-end reads: *_1_reformated and *_2_reformated files. For single-end: *_reformated file."
41+
pattern: "*_reformated.{fastq,fq,fastq.gz,fq.gz,fasta,fa,fasta.gz,fa.gz}"
42+
ontologies:
43+
- edam: http://edamontology.org/format_1930 # FASTQ
44+
- edam: http://edamontology.org/format_1929 # FASTA
45+
- singleton:
46+
type: file
47+
description: "Optional singleton reads from paired-end processing (reads where mate is missing). Only emitted for paired-end inputs."
48+
pattern: "*_singleton.{fastq,fq,fastq.gz,fq.gz,fasta,fa,fasta.gz,fa.gz}"
49+
optional: true
50+
ontologies:
51+
- edam: http://edamontology.org/format_1930 # FASTQ
52+
- edam: http://edamontology.org/format_1929 # FASTA
53+
- log:
54+
type: file
55+
description: "BBMap reformat.sh log file"
56+
pattern: "*.log"
57+
- versions:
58+
type: file
59+
description: File containing software versions
60+
pattern: versions.yml
61+
ontologies:
62+
- edam: http://edamontology.org/format_3750 # YAML
63+
64+
authors:
65+
- "@timrozday-mgnify"
66+
maintainers:
67+
- "@timrozday-mgnify"
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
nextflow_process {
2+
3+
name "Test Process BBMAP_REFORMAT_STANDARDISE"
4+
script "../main.nf"
5+
process "BBMAP_REFORMAT_STANDARDISE"
6+
7+
tag "modules"
8+
tag "bbmap"
9+
tag "bbmap/reformat_standardise"
10+
11+
test("paired-end fastq") {
12+
13+
config "./nextflow.config"
14+
15+
when {
16+
process {
17+
"""
18+
input[0] = [
19+
[ id:'test', single_end:false, interleaved:false ],
20+
[
21+
file('${moduleDir}/tests/data/test-reads/ERR4334351_test_1.fastq.gz', checkIfExists: true),
22+
file('${moduleDir}/tests/data/test-reads/ERR4334351_test_2.fastq.gz', checkIfExists: true)
23+
]
24+
]
25+
input[1] = 'fastq.gz'
26+
"""
27+
}
28+
}
29+
30+
then {
31+
assertAll(
32+
{ assert process.success },
33+
{ assert snapshot(process.out.reformated, process.out.versions).match() }
34+
)
35+
}
36+
37+
}
38+
39+
test("paired-end fastq - stub") {
40+
41+
options "-stub"
42+
config "./nextflow.config"
43+
44+
when {
45+
process {
46+
"""
47+
input[0] = [
48+
[ id:'test', single_end:false, interleaved:false ],
49+
[
50+
file('${moduleDir}/tests/data/test-reads/ERR4334351_test_1.fastq.gz', checkIfExists: true),
51+
file('${moduleDir}/tests/data/test-reads/ERR4334351_test_2.fastq.gz', checkIfExists: true)
52+
]
53+
]
54+
input[1] = 'fastq.gz'
55+
"""
56+
}
57+
}
58+
59+
then {
60+
assertAll(
61+
{ assert process.success },
62+
{ assert snapshot(process.out.reformated, process.out.versions).match() }
63+
)
64+
}
65+
66+
}
67+
68+
}
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
{
2+
"paired-end fastq - stub": {
3+
"content": [
4+
[
5+
[
6+
{
7+
"id": "test",
8+
"single_end": false,
9+
"interleaved": false
10+
},
11+
[
12+
"test_1_reformated.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940",
13+
"test_2_reformated.fastq.gz:md5,68b329da9893e34099c7d8ad5cb9c940"
14+
]
15+
]
16+
],
17+
[
18+
"versions.yml:md5,0ab1d8143bd1544270934927271e7172"
19+
]
20+
],
21+
"meta": {
22+
"nf-test": "0.9.2",
23+
"nextflow": "25.10.0"
24+
},
25+
"timestamp": "2026-02-17T11:08:20.296962"
26+
},
27+
"paired-end fastq": {
28+
"content": [
29+
[
30+
[
31+
{
32+
"id": "test",
33+
"single_end": false,
34+
"interleaved": false
35+
},
36+
[
37+
"test_1_reformated.fastq.gz:md5,f17017afc5027ae04d55ce8138e67e6d",
38+
"test_2_reformated.fastq.gz:md5,f224591861b3b8a39e9232dea0caa20a"
39+
]
40+
]
41+
],
42+
[
43+
"versions.yml:md5,0ab1d8143bd1544270934927271e7172"
44+
]
45+
],
46+
"meta": {
47+
"nf-test": "0.9.2",
48+
"nextflow": "25.10.0"
49+
},
50+
"timestamp": "2026-02-17T11:08:16.558745"
51+
}
52+
}
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
// Test-only configuration: extra reformat.sh arguments passed to the module via ext.args.
process {
    withName: 'BBMAP_REFORMAT_STANDARDISE' {
        ext.args = 'allowidenticalnames=t trimreaddescription=t'
    }
}

0 commit comments

Comments
 (0)