Skip to content

Commit b6d2385

Browse files
authored
Merge pull request #61 from EBI-Metagenomics/bugfix/downstream-should-use-decontaminated-contigs
The filtered assemblies used downstream should be QC filtered && decontaminated
2 parents 9b509e4 + f0886c4 commit b6d2385

File tree

7 files changed

+119
-68
lines changed

7 files changed

+119
-68
lines changed

conf/modules.config

Lines changed: 2 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -21,24 +21,11 @@ process {
2121
]
2222
}
2323

24-
/* Most of the pipeline only analyzes contigs >= this threshold, but BGC uses a threshold size. */
25-
withName: FILTER_ASSEMBLY {
24+
withName: INDEX_AND_PUBLISH_CONTIGS {
2625
publishDir = [
2726
path: { "${params.outdir}/${meta.id}/qc" },
2827
mode: params.publish_dir_mode,
29-
pattern: "*_filtered.fasta.gz*",
30-
saveAs: { filename -> {
31-
if (filename.contains('fasta.gz.gzi')) {
32-
return "${meta.id}_filtered_contigs.fasta.gz.gzi"
33-
}
34-
if (filename.contains("fasta.gz.fai")) {
35-
return "${meta.id}_filtered_contigs.fasta.gz.fai"
36-
}
37-
if (filename.contains("fasta.gz")) {
38-
return "${meta.id}_filtered_contigs.fasta.gz"
39-
}
40-
}
41-
},
28+
saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
4229
]
4330
}
4431

docs/output.md

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ The `qc` directory contains output files related to the quality control steps of
4141

4242
#### Output files
4343

44-
- **ERZ12345_filtered_contigs.fasta.gz**: This `FASTA` file contains the filtered contigs after the removal of those that are shorter than 500 bases, and which have a proportion of ambiguous bases higher than 10%.
44+
- **ERZ12345_filtered_contigs.fasta.gz**: This `FASTA` file contains the final processed contigs. Contigs shorter than 500 bases and those with a proportion of ambiguous bases higher than 10% are removed first. If decontamination is enabled, contigs aligning to the PhiX, human, or host reference genomes are also removed. It is bgzip-compressed to allow random access via the accompanying index files.
4545
- **ERZ12345_filtered_contigs.fasta.gz.gzi**: This file is the compression index for the bgzip-compressed filtered_contigs FASTA file.
4646
- **ERZ12345_filtered_contigs.fasta.gz.fai**: This file is the FASTA index for the bgzip-compressed filtered_contigs FASTA file.
4747
- **ERZ12345_quast_stats.tsv.gz**: This compressed `tsv` file contains the QUAST summary output, giving an assessment of the quality of the contigs of this assembly.
@@ -337,10 +337,11 @@ ERZ56790,insufficient_contigs_after_n_content_filtering
337337

338338
#### The possible QC failed statuses are
339339

340-
| Status | Description |
341-
| ---------------------------------------------- | -------------------------------------------------------------------------- |
342-
| insufficient_contigs_after_length_filtering | No contigs remained after applying the minimum length filter. |
343-
| insufficient_contigs_after_n_content_filtering | No contigs remained after filtering out sequences with high N-base content |
340+
| Status | Description |
341+
| ---------------------------------------------- | ----------------------------------------------------------------------------------------- |
342+
| insufficient_contigs_after_length_filtering | No contigs remained after applying the minimum length filter. |
343+
| insufficient_contigs_after_n_content_filtering | No contigs remained after filtering out sequences with high N-base content. |
344+
| insufficient_contigs_after_decontamination | No contigs remained after decontamination against PhiX, human, or host reference genomes. |
344345

345346
### MultiQC
346347

modules/local/filter_assembly.nf

Lines changed: 6 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -4,18 +4,16 @@ process FILTER_ASSEMBLY {
44

55
conda "${moduleDir}/environment.yml"
66
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
7-
'oras://community.wave.seqera.io/library/htslib_samtools_seqkit:049a7c2199a04854':
8-
'community.wave.seqera.io/library/htslib_samtools_seqkit:8d071a2f3053d830' }"
7+
'https://depot.galaxyproject.org/singularity/seqkit:2.9.0--h9ee0642_0':
8+
'biocontainers/seqkit:2.9.0--h9ee0642_0' }"
99

1010
input:
1111
tuple val(meta), path(assembly)
1212

1313
output:
14-
tuple val(meta), path("${prefix}_filtered.fasta.gz") , emit: fasta, optional: true
15-
tuple val(meta), path("${prefix}_filtered.fasta.gz.fai") , emit: fai, optional: true
16-
tuple val(meta), path("${prefix}_filtered.fasta.gz.gzi") , emit: gzi, optional: true
17-
tuple val(meta), env('EXIT_REASON') , emit: exit_reason, optional: true
18-
path "versions.yml" , emit: versions
14+
tuple val(meta), path("${prefix}_filtered.fasta.gz"), emit: fasta, optional: true
15+
tuple val(meta), env('EXIT_REASON') , emit: exit_reason, optional: true
16+
path "versions.yml" , emit: versions
1917

2018
when:
2119
task.ext.when == null || task.ext.when
@@ -44,21 +42,9 @@ process FILTER_ASSEMBLY {
4442
# Check if N-base filtering produced any sequences
4543
if [[ -s ${prefix}_nbases_filtered.tab2fx ]]; then
4644
echo "Some contigs remain after filtering..."
47-
echo "Converting to FASTA..."
4845
seqkit tab2fx ${prefix}_nbases_filtered.tab2fx \\
4946
--threads ${task.cpus} \\
50-
--out-file ${prefix}_filtered.fasta
51-
52-
# bgzip to enable .gzi block index
53-
echo "Compressing as bgzip..."
54-
bgzip -@ "${task.cpus}" ${prefix}_filtered.fasta
55-
56-
# using samtools as seqkit cannot index a .fasta.gz
57-
echo "Indexing compressed fasta..."
58-
samtools faidx ${prefix}_filtered.fasta.gz # -> _filtered.fasta.gz.fai
59-
60-
echo "Indexing compression archive..."
61-
bgzip -r ${prefix}_filtered.fasta.gz # -> _filtered.fasta.gz.gzi
47+
--out-file ${prefix}_filtered.fasta.gz
6248
else
6349
echo "No contigs after the N bases filtering"
6450
EXIT_REASON="insufficient_contigs_after_n_content_filtering"
@@ -68,21 +54,17 @@ process FILTER_ASSEMBLY {
6854
cat <<-END_VERSIONS > versions.yml
6955
"${task.process}":
7056
seqkit: \$(seqkit version | cut -d' ' -f2)
71-
samtools: \$(samtools --version-only)
7257
END_VERSIONS
7358
"""
7459

7560
stub:
7661
prefix = task.ext.prefix ?: "${meta.id}"
7762
"""
7863
touch ${prefix}_filtered.fasta.gz
79-
touch ${prefix}_filtered.fasta.gz.fai
80-
touch ${prefix}_filtered.fasta.gz.gzi
8164
8265
cat <<-END_VERSIONS > versions.yml
8366
"${task.process}":
8467
seqkit: \$(seqkit version | cut -d' ' -f2)
85-
samtools: \$(samtools --version-only)
8668
END_VERSIONS
8769
"""
8870
}
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
/*
2+
* This process sits at the end of QC (length filter → N-content
3+
* filter → decontamination) and serves two purposes:
4+
*
5+
* 1. Final viability check — decontamination can remove all remaining contigs
6+
* even after upstream filters passed. If nothing is left the assembly is
7+
* failed here with exit reason "insufficient_contigs_after_decontamination".
8+
*
9+
* 2. bgzip re-compression and indexing — This process re-compresses as bgzip and
10+
* creates the .fai and .gzi indices required for random-access tools.
11+
*
12+
* It is the single publisher of the canonical _filtered_contigs.fasta.gz
13+
* output (which represents the most processed form of the assembly: filtered
14+
* and, when enabled, decontaminated). This is a stopgap until the pipeline
15+
* migrates to Nextflow workflow-level outputs.
16+
*/
17+
process INDEX_AND_PUBLISH_CONTIGS {
18+
19+
tag "$meta.id"
20+
21+
label 'process_single'
22+
23+
conda "${moduleDir}/environment.yml"
24+
container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
25+
'oras://community.wave.seqera.io/library/htslib_samtools_seqkit:049a7c2199a04854':
26+
'community.wave.seqera.io/library/htslib_samtools_seqkit:8d071a2f3053d830' }"
27+
28+
input:
29+
tuple val(meta), path(contigs)
30+
31+
output:
32+
tuple val(meta), path("${meta.id}_filtered_contigs.fasta.gz"), emit: filtered_contigs, optional: true
33+
path "${meta.id}_filtered_contigs.fasta.gz.fai" , optional: true
34+
path "${meta.id}_filtered_contigs.fasta.gz.gzi" , optional: true
35+
tuple val(meta), env('EXIT_REASON') , emit: exit_reason, optional: true
36+
path "versions.yml" , emit: versions
37+
38+
script:
39+
"""
40+
NUM_SEQS=\$(zcat ${contigs} | grep -c '^>')
41+
42+
if [[ \$NUM_SEQS -eq 0 ]]; then
43+
EXIT_REASON="insufficient_contigs_after_decontamination"
44+
else
45+
zcat ${contigs} | bgzip -@ ${task.cpus} > ${meta.id}_filtered_contigs.fasta.gz
46+
samtools faidx ${meta.id}_filtered_contigs.fasta.gz
47+
bgzip -r ${meta.id}_filtered_contigs.fasta.gz
48+
fi
49+
50+
cat <<-END_VERSIONS > versions.yml
51+
"${task.process}":
52+
samtools: \$(samtools --version-only)
53+
bgzip: \$(bgzip --version | head -1 | cut -d' ' -f3)
54+
END_VERSIONS
55+
"""
56+
57+
stub:
58+
"""
59+
touch ${meta.id}_filtered_contigs.fasta.gz
60+
touch ${meta.id}_filtered_contigs.fasta.gz.fai
61+
touch ${meta.id}_filtered_contigs.fasta.gz.gzi
62+
63+
cat <<-END_VERSIONS > versions.yml
64+
"${task.process}":
65+
samtools: \$(samtools --version-only)
66+
bgzip: \$(bgzip --version | head -1 | cut -d' ' -f3)
67+
END_VERSIONS
68+
"""
69+
}

subworkflows/local/assembly_qc.nf

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
/* LOCAL */
2-
include { FILTER_ASSEMBLY } from '../../modules/local/filter_assembly'
3-
include { ASSEMBLY_DECONTAMINATION } from '../ebi-metagenomics/assembly_decontamination/main'
2+
include { FILTER_ASSEMBLY } from '../../modules/local/filter_assembly'
3+
include { ASSEMBLY_DECONTAMINATION } from '../ebi-metagenomics/assembly_decontamination/main'
4+
include { INDEX_AND_PUBLISH_CONTIGS } from '../../modules/local/index_and_publish_contigs'
45

56
/* NF-CORE */
67
include { QUAST } from '../../modules/nf-core/quast/main'
@@ -12,7 +13,7 @@ workflow ASSEMBLY_QC {
1213

1314
main:
1415

15-
ch_versions = Channel.empty()
16+
ch_versions = channel.empty()
1617

1718
/*
1819
* Filter sequences based on specified criteria:
@@ -34,14 +35,22 @@ workflow ASSEMBLY_QC {
3435
)
3536
ch_versions = ch_versions.mix(ASSEMBLY_DECONTAMINATION.out.versions)
3637

38+
// Checks viability, re-compresses as bgzip, indexes, and publishes the final
39+
// contigs. Single ownership of _filtered_contigs.fasta.gz as a stopgap until
40+
// we migrate to the workflow-level outputs.
41+
INDEX_AND_PUBLISH_CONTIGS(
42+
ASSEMBLY_DECONTAMINATION.out.cleaned_contigs
43+
)
44+
ch_versions = ch_versions.mix(INDEX_AND_PUBLISH_CONTIGS.out.versions)
45+
3746
QUAST(
38-
ch_assembly.mix( ASSEMBLY_DECONTAMINATION.out.cleaned_contigs.ifEmpty([]) ).groupTuple()
47+
ch_assembly.mix( INDEX_AND_PUBLISH_CONTIGS.out.filtered_contigs.ifEmpty([]) ).groupTuple()
3948
)
4049
ch_versions = ch_versions.mix(QUAST.out.versions)
4150

4251
emit:
43-
assembly_qc_pass = ASSEMBLY_DECONTAMINATION.out.cleaned_contigs
44-
qc_failed_assemblies = FILTER_ASSEMBLY.out.exit_reason
52+
assembly_qc_pass = INDEX_AND_PUBLISH_CONTIGS.out.filtered_contigs
53+
qc_failed_assemblies = FILTER_ASSEMBLY.out.exit_reason.mix(INDEX_AND_PUBLISH_CONTIGS.out.exit_reason)
4554
quast_report_tsv = QUAST.out.tsv
4655
phix_contaminated_contigs_tsv = ASSEMBLY_DECONTAMINATION.out.phix_contaminated_contigs_tsv
4756
human_contaminated_contigs_tsv = ASSEMBLY_DECONTAMINATION.out.human_contaminated_contigs_tsv

tests/default.nf.test.snap

Lines changed: 17 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"-profile test": {
33
"content": [
4-
155,
4+
157,
55
{
66
"ANTISMASH_ANTISMASH": {
77
"antismash": "8.0.1",
@@ -106,8 +106,7 @@
106106
"gawk": "5.3.0"
107107
},
108108
"FILTER_ASSEMBLY": {
109-
"seqkit": "v2.10.0",
110-
"samtools": "1.22.1+htslib-1.22.1"
109+
"seqkit": "v2.9.0"
111110
},
112111
"FILTER_IPS_AND_FAA_BY_CONTIGS": {
113112
"seqkit": "2.13.0",
@@ -133,6 +132,10 @@
133132
"hmmer": 3.4,
134133
"kofam version": 202503
135134
},
135+
"INDEX_AND_PUBLISH_CONTIGS": {
136+
"samtools": "1.22.1+htslib-1.22.1",
137+
"bgzip": "1.22.1"
138+
},
136139
"INFERNAL_CMSEARCH": {
137140
"cmsearch": "1.1.5",
138141
"Rfam version": 15
@@ -473,10 +476,10 @@
473476
"ERZ101_kegg_modules_summary.tsv.gz:md5,9d9859bfbbfac5b1708b6472f7eb74bb",
474477
"ERZ101_kegg_modules_summary.tsv.gz.gzi:md5,7dea362b3fac8e00956a4952a3d4f474",
475478
"ERZ101_sanntis.gff.gz:md5,df19e1b84ba6f691d20c72b397c88abf",
476-
"ERZ101_filtered_contigs.fasta.gz:md5,010fd0b578bdd9629e9a9b41e8b5bbd6",
477-
"ERZ101_filtered_contigs.fasta.gz.fai:md5,57034897e5827c1dba986c70145a1d63",
479+
"ERZ101_filtered_contigs.fasta.gz:md5,9cf21175bb2e4d5c65110facfb06ef14",
480+
"ERZ101_filtered_contigs.fasta.gz.fai:md5,b5f1254f9fa7bad5713d7c056db3d69c",
478481
"ERZ101_filtered_contigs.fasta.gz.gzi:md5,5737309e5668e4669c50a0a7f3c7ff0b",
479-
"ERZ101_quast_stats.tsv.gz:md5,f1c11d01769d2f12c567067c0f732864",
482+
"ERZ101_quast_stats.tsv.gz:md5,3581c09b4443c3b62e4fc910eedf0efe",
480483
"ERZ101_aligned_to_contaminant.tsv.gz:md5,bf678dade1ff3067918b61ce0d4225e6",
481484
"ERZ101_aligned_to_human.tsv.gz:md5,515849d5bcd7dfbdb518539785d47f50",
482485
"ERZ101.html:md5,e56d9f9358ef11fc76a5c4d95bcea909",
@@ -516,10 +519,10 @@
516519
"ERZ102_kegg_modules_summary.tsv.gz:md5,9d9859bfbbfac5b1708b6472f7eb74bb",
517520
"ERZ102_kegg_modules_summary.tsv.gz.gzi:md5,7dea362b3fac8e00956a4952a3d4f474",
518521
"ERZ102_sanntis.gff.gz:md5,df19e1b84ba6f691d20c72b397c88abf",
519-
"ERZ102_filtered_contigs.fasta.gz:md5,dcfc40de8e74f662979c8089436e2191",
520-
"ERZ102_filtered_contigs.fasta.gz.fai:md5,0fc4caf89637e0f0a36722a24c0e1317",
522+
"ERZ102_filtered_contigs.fasta.gz:md5,753656213b909342ae6dd1e8750c3775",
523+
"ERZ102_filtered_contigs.fasta.gz.fai:md5,6e6e178bbb564b8d292ba6bce4dd40ef",
521524
"ERZ102_filtered_contigs.fasta.gz.gzi:md5,03dd37017aa7366aa239bad2a4eac58e",
522-
"ERZ102_quast_stats.tsv.gz:md5,6dbcb902abac2a1435a1af29a87c8808",
525+
"ERZ102_quast_stats.tsv.gz:md5,c98abd37d7178cb885601535df368a49",
523526
"ERZ102_aligned_to_human.tsv.gz:md5,9654f1968ff591b11032e2447e422951",
524527
"ERZ102.html:md5,4bbfb6dff8ab8cb9b1f058b5a94eebdd",
525528
"ERZ102.krona.txt.gz:md5,ac2d11d2e91eeb5ca459caff14762973",
@@ -529,18 +532,18 @@
529532
"analysed_assemblies.csv:md5,333c3997aa4621cf3a644719e9021296",
530533
"samplesheet_dram.tsv.gz:md5,d9509874f9f6bbc2ab165106655fbce1",
531534
"multiqc_citations.txt:md5,2d1a8ef8ba06c7eada06ab3e96552ea4",
532-
"multiqc_general_stats.txt:md5,8b5ca1d9ddb8883eac63fffefd2ae6b8",
535+
"multiqc_general_stats.txt:md5,e9859d7e5ebcbcc222fec497deea93f1",
533536
"multiqc_host_decontamination_table.txt:md5,b8d06985826c0451c759e08382db2347",
534537
"multiqc_host_decontamination_table_table.txt:md5,b8d06985826c0451c759e08382db2347",
535538
"multiqc_human_decontamination_table.txt:md5,f8bc5f4b45b46d49826e5d8207a3b8f2",
536539
"multiqc_human_decontamination_table_table.txt:md5,f8bc5f4b45b46d49826e5d8207a3b8f2",
537-
"multiqc_quast.txt:md5,dc380d5fca13b91b7c21552ba8cd9f70",
538-
"quast_num_contigs.txt:md5,a078f48987914adbb249f0d8382f31f1",
539-
"quast_table.txt:md5,4dfa7d5c1a3715c24c29d1fe8c458319",
540+
"multiqc_quast.txt:md5,049b013e246ef77508966207d654e84b",
541+
"quast_num_contigs.txt:md5,b7c7360a303f1d50f3a1d50242e821a9",
542+
"quast_table.txt:md5,213c20ee21a3d9700463cdd53cd9b529",
540543
"qc_failed_assemblies.csv:md5,3292564aa6122920901d9bb724447ad4"
541544
]
542545
],
543-
"timestamp": "2026-03-09T09:26:09.063484904",
546+
"timestamp": "2026-03-09T20:21:49.498218773",
544547
"meta": {
545548
"nf-test": "0.9.4",
546549
"nextflow": "25.10.0"

tests/test_samplesheet.csv

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
sample,assembly_fasta,contaminant_reference,human_reference,phix_reference
2-
ERZ101,https://github.com/EBI-Metagenomics/assembly-analysis-pipeline/raw/refs/heads/feature/chunk-contigs-and-ips/tests/assembly_erz101.fasta.gz,contamination,human,
3-
ERZ102,https://github.com/EBI-Metagenomics/assembly-analysis-pipeline/raw/refs/heads/feature/chunk-contigs-and-ips/tests/assembly_erz102.fasta.gz,,,
4-
ERZ666,https://github.com/EBI-Metagenomics/assembly-analysis-pipeline/raw/refs/heads/feature/chunk-contigs-and-ips/tests/assembly_qc_fail_length_filtered.fasta.gz,,,
5-
ERZ999,https://github.com/EBI-Metagenomics/assembly-analysis-pipeline/raw/refs/heads/feature/chunk-contigs-and-ips/tests/assembly_qc_fail_n_bases.fasta.gz,,,
2+
ERZ101,https://github.com/EBI-Metagenomics/assembly-analysis-pipeline/raw/refs/heads/main/tests/assembly_erz101.fasta.gz,contamination,human,
3+
ERZ102,https://github.com/EBI-Metagenomics/assembly-analysis-pipeline/raw/refs/heads/main/tests/assembly_erz102.fasta.gz,,,
4+
ERZ666,https://github.com/EBI-Metagenomics/assembly-analysis-pipeline/raw/refs/heads/main/tests/assembly_qc_fail_length_filtered.fasta.gz,,,
5+
ERZ999,https://github.com/EBI-Metagenomics/assembly-analysis-pipeline/raw/refs/heads/main/tests/assembly_qc_fail_n_bases.fasta.gz,,,

0 commit comments

Comments
 (0)