Skip to content

Commit a067fa8

Browse files
9244 new subworkflow fastq preprocess (#9281)
* remove vscode settings json * restore original vscode json * fix issues --------- Co-authored-by: Evangelos Karatzas <[email protected]>
1 parent 99fe1cc commit a067fa8

File tree

6 files changed

+599
-0
lines changed

6 files changed

+599
-0
lines changed
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
include { FASTQ_SANITISE_SEQKIT } from '../fastq_sanitise_seqkit/main'
2+
include { SEQKIT_SEQ } from '../../../modules/nf-core/seqkit/seq/main'
3+
include { SEQKIT_REPLACE } from '../../../modules/nf-core/seqkit/replace/main'
4+
include { SEQKIT_RMDUP } from '../../../modules/nf-core/seqkit/rmdup/main'
5+
6+
/*
 * FASTQ_PREPROCESS
 *
 * Optionally sanitises reads with FASTQ_SANITISE_SEQKIT, then splits every sample
 * into one channel item per FASTQ file (tagging the meta map with a temporary
 * `strandness` key), optionally runs SEQKIT_SEQ, SEQKIT_REPLACE and SEQKIT_RMDUP
 * on each file, and finally merges the files back into one item per sample.
 *
 * take:
 *   ch_reads              - [ val(meta), fastq ] or [ val(meta), [ fastq_1, fastq_2 ] ];
 *                           meta.single_end decides which shape is expected
 *   skip_seqkit_sana_pair - boolean, true skips FASTQ_SANITISE_SEQKIT
 *   skip_seqkit_seq       - boolean, true skips SEQKIT_SEQ
 *   skip_seqkit_replace   - boolean, true skips SEQKIT_REPLACE
 *   skip_seqkit_rmdup     - boolean, true skips SEQKIT_RMDUP
 *
 * emit:
 *   reads    - [ val(meta), fastq ] for single-end, [ val(meta), [ R1, R2 ] ] for paired-end
 *   versions - versions.yml files (one per tool that actually ran)
 *
 * Fails with an error when the number of files does not match meta.single_end.
 */
workflow FASTQ_PREPROCESS {

    take:
    ch_reads              // channel: [ val(meta), [ fastq ] ]
    skip_seqkit_sana_pair // boolean
    skip_seqkit_seq       // boolean
    skip_seqkit_replace   // boolean
    skip_seqkit_rmdup     // boolean

    main:
    ch_versions = Channel.empty()

    if (!skip_seqkit_sana_pair) {
        FASTQ_SANITISE_SEQKIT( ch_reads )
        ch_reads    = FASTQ_SANITISE_SEQKIT.out.reads
        ch_versions = ch_versions.mix(FASTQ_SANITISE_SEQKIT.out.versions.first())
    }

    // Split paired-end reads and add strandedness to meta
    ch_reads_split = ch_reads
        .flatMap { meta, reads ->
            if (meta.single_end) {
                if (reads instanceof List && reads.size() != 1) {
                    error("Error: Check your meta.single_end value. Single-end reads should contain one file only.")
                }
                return [[ meta + [strandness: 'single'], reads ]]
            } else {
                if (!(reads instanceof List) || reads.size() != 2) {
                    error("Error: Check your meta.single_end value. Paired-end data should have exactly 2 files.")
                }
                return [
                    [ meta + [strandness: 'R1'], reads[0] ],
                    [ meta + [strandness: 'R2'], reads[1] ]
                ]
            }
        }

    if (!skip_seqkit_seq) {
        SEQKIT_SEQ( ch_reads_split )
        ch_reads_split = SEQKIT_SEQ.out.fastx
        ch_versions    = ch_versions.mix(SEQKIT_SEQ.out.versions.first())
    }

    if (!skip_seqkit_replace) {
        SEQKIT_REPLACE( ch_reads_split )
        ch_reads_split = SEQKIT_REPLACE.out.fastx
        ch_versions    = ch_versions.mix(SEQKIT_REPLACE.out.versions.first())
    }

    if (!skip_seqkit_rmdup) {
        SEQKIT_RMDUP( ch_reads_split )
        ch_reads_split = SEQKIT_RMDUP.out.fastx
        ch_versions    = ch_versions.mix(SEQKIT_RMDUP.out.versions.first())
    }

    ch_reads = ch_reads_split
        .map { meta, fastq ->
            // Drop the temporary strandness key so both mates share one grouping key again,
            // but keep its value next to the file so mate order can be restored after grouping
            def clean_meta = meta.findAll { key, _value -> key != 'strandness' }
            return [ clean_meta, meta.strandness, fastq ]
        }
        .groupTuple(by: 0)
        .map { meta, strands, files ->
            if (meta.single_end) {
                return [ meta, files[0] ]
            }
            // Order mates by their strandness tag ('R1' sorts before 'R2') instead of by
            // file name: the names are chosen by the upstream modules / ext.prefix and are
            // not guaranteed to differ between mates or to sort in mate order
            def ordered_files = [ strands, files ]
                .transpose()
                .sort { pair -> pair[0] }
                .collect { pair -> pair[1] }
                .flatten()
            return [ meta, ordered_files ]
        }

    emit:
    reads    = ch_reads    // channel: [ val(meta), [ fastq ] ]
    versions = ch_versions // channel: [ versions.yml ]

}
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json
name: "fastq_preprocess"
description: Subworkflow that preprocesses FASTQ files
keywords:
  - fastq
  - seqkit
  - preprocessing
components:
  - fastq_sanitise_seqkit
  - seqkit/sana
  - seqkit/pair
  - seqkit/seq
  - seqkit/replace
  - seqkit/rmdup
input:
  - ch_reads:
      type: channel
      description: |
        Channel containing sample metadata and FASTQ files.
        Structure: [ val(meta), [ fastq ] ]
        Where meta is a map containing at least:
          - id: sample identifier
          - single_end: boolean indicating if data is single-end (true) or paired-end (false)
        Single-end samples must provide exactly one file and paired-end samples exactly two.
      pattern: "*.{fastq,fastq.gz,fq,fq.gz}"
  - skip_seqkit_sana_pair:
      type: boolean
      description: |
        If true, skips the fastq_sanitise_seqkit subworkflow (seqkit sana and seqkit pair).
  - skip_seqkit_seq:
      type: boolean
      description: |
        If true, skips the seqkit_seq process.
  - skip_seqkit_replace:
      type: boolean
      description: |
        If true, skips the seqkit_replace process.
  - skip_seqkit_rmdup:
      type: boolean
      description: |
        If true, skips the seqkit_rmdup process.
output:
  - reads:
      type: channel
      description: |
        Channel containing filtered FASTQ files.
        Structure: [ val(meta), [ fastq ] ]
      pattern: "*.{fastq,fastq.gz,fq,fq.gz}"
  - versions:
      type: file
      description: |
        File containing software versions
        Structure: [ path(versions.yml) ]
      pattern: "versions.yml"
authors:
  - "@maia-munteanu"
maintainers:
  - "@maia-munteanu"
  - "@vagkaratzas"
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
// IMPORTANT: This config file should be included to ensure that the subworkflow works properly.
//
// FASTQ_PREPROCESS runs SEQKIT_SEQ, SEQKIT_REPLACE and SEQKIT_RMDUP once per FASTQ file, with a
// meta map that carries a `strandness` tag ('single', 'R1' or 'R2'). Using that tag and a
// per-step suffix in ext.prefix keeps the outputs of the two mates, and of consecutive steps,
// from sharing one file name.
// NOTE(review): assumes the seqkit modules fall back to "${meta.id}" when ext.prefix is unset
// (the usual nf-core convention) and that SEQKIT_SANA also receives a meta with `strandness`
// from FASTQ_SANITISE_SEQKIT - confirm against the module sources.
process {

    withName: SEQKIT_SANA {
        ext.prefix = { "${meta.id}_${meta.strandness}" }
    }

    withName: SEQKIT_SEQ {
        ext.prefix = { "${meta.id}_${meta.strandness}_seq" }
    }

    withName: SEQKIT_REPLACE {
        ext.prefix = { "${meta.id}_${meta.strandness}_replace" }
    }

    withName: SEQKIT_RMDUP {
        ext.prefix = { "${meta.id}_${meta.strandness}_rmdup" }
    }

}
Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
// nf-test suite for the FASTQ_PREPROCESS subworkflow.
// Covers single-end, paired-end and mixed inputs with every step enabled, a paired-end run
// with every step skipped (pure split/regroup round trip), and a stub run.
nextflow_workflow {

    name "Test Subworkflow FASTQ_PREPROCESS"
    script "../main.nf"
    workflow "FASTQ_PREPROCESS"
    config './nextflow.config'

    tag "subworkflows"
    tag "subworkflows_nfcore"
    tag "subworkflows/fastq_sanitise_seqkit"
    tag "subworkflows/fastq_preprocess"
    tag "seqkit"
    tag "seqkit/sana"
    tag "seqkit/pair"
    tag "seqkit/seq"
    tag "seqkit/replace"
    tag "seqkit/rmdup"

    test("sarscov2 - fastq - single_end") {

        when {
            workflow {
                """
                input[0] = Channel.of([
                    [ id:'test_single', single_end:true ], // meta map
                    file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true)
                ])
                input[1] = false // skip_seqkit_sana_pair
                input[2] = false // skip_seqkit_seq
                input[3] = false // skip_seqkit_replace
                input[4] = false // skip_seqkit_rmdup
                """
            }
        }

        then {
            assertAll(
                { assert workflow.success },
                { assert snapshot(
                    workflow.out,
                    workflow.out.versions.collect{ path(it).yaml }.unique()
                    ).match() }
            )
        }
    }

    test("sarscov2 - fastq - paired_end") {

        when {
            workflow {
                """
                input[0] = Channel.of([
                    [ id:'test_paired', single_end:false ], // meta map
                    [
                        file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true),
                        file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true)
                    ]
                ])
                input[1] = false // skip_seqkit_sana_pair
                input[2] = false // skip_seqkit_seq
                input[3] = false // skip_seqkit_replace
                input[4] = false // skip_seqkit_rmdup
                """
            }
        }

        then {
            assertAll(
                { assert workflow.success },
                { assert snapshot(
                    workflow.out,
                    workflow.out.versions.collect{ path(it).yaml }.unique()
                    ).match() }
            )
        }
    }

    test("sarscov2 - fastq - both with single broken") {

        when {
            workflow {
                """
                input[0] = Channel.of(
                    [
                        [ id:'test_both', single_end:true ], // meta map
                        file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1_broken.fastq.gz', checkIfExists: true)
                    ],
                    [
                        [ id:'test_both', single_end:false ], // meta map
                        [
                            file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true),
                            file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true)
                        ]
                    ]
                )
                input[1] = false // skip_seqkit_sana_pair
                input[2] = false // skip_seqkit_seq
                input[3] = false // skip_seqkit_replace
                input[4] = false // skip_seqkit_rmdup
                """
            }
        }

        then {
            assertAll(
                { assert workflow.success },
                { assert snapshot(
                    workflow.out,
                    workflow.out.versions.collect{ path(it).yaml }.unique()
                    ).match() }
            )
        }
    }

    // Exercises the skip_* = true branches: no tool runs, so the reads must come back as one
    // item with both mates in their original order and no versions must be emitted.
    // Uses direct assertions instead of a snapshot so no snapshot entry is required.
    test("sarscov2 - fastq - paired_end - skip all") {

        when {
            workflow {
                """
                input[0] = Channel.of([
                    [ id:'test_skip', single_end:false ], // meta map
                    [
                        file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true),
                        file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true)
                    ]
                ])
                input[1] = true // skip_seqkit_sana_pair
                input[2] = true // skip_seqkit_seq
                input[3] = true // skip_seqkit_replace
                input[4] = true // skip_seqkit_rmdup
                """
            }
        }

        then {
            assertAll(
                { assert workflow.success },
                { assert workflow.out.reads.size() == 1 },
                { assert workflow.out.reads[0][0].id == 'test_skip' },
                { assert workflow.out.reads[0][0].single_end == false },
                { assert !workflow.out.reads[0][0].containsKey('strandness') },
                { assert workflow.out.reads[0][1].size() == 2 },
                { assert workflow.out.reads[0][1][0].toString().endsWith('test_1.fastq.gz') },
                { assert workflow.out.reads[0][1][1].toString().endsWith('test_2.fastq.gz') },
                { assert workflow.out.versions.size() == 0 }
            )
        }
    }

    test("sarscov2 - fastq - stub") {

        options "-stub"

        when {
            workflow {
                """
                input[0] = Channel.of([
                    [ id: 'test_stub', single_end:true ],
                    file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1_broken.fastq.gz', checkIfExists: true)
                ])
                input[1] = false // skip_seqkit_sana_pair
                input[2] = false // skip_seqkit_seq
                input[3] = false // skip_seqkit_replace
                input[4] = false // skip_seqkit_rmdup
                """
            }
        }

        then {
            assertAll(
                { assert workflow.success },
                { assert snapshot(
                    workflow.out,
                    workflow.out.versions.collect{ path(it).yaml }.unique()
                    ).match() }
            )
        }
    }
}

0 commit comments

Comments
 (0)