Skip to content

Commit 4d6dcbd

Browse files
committed
Add deacon infrastructure for host read scrubbing
Introduces deacon modules, host depletion subworkflow, config params, Pydantic model fields, CLI options, JSON schema, and publishDir entry. Not yet wired into the preprocessing workflow.
1 parent d36cdc3 commit 4d6dcbd

File tree

10 files changed

+374
-16
lines changed

10 files changed

+374
-16
lines changed

.github/scripts/validate_schema_completeness.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,11 @@
8383
"state_dir",
8484
# Exposed via negated CLI flag --no-slack
8585
"slack_enabled",
86+
# Deacon tuning (set via params-file or preset)
87+
"deacon_kmer_size",
88+
"deacon_window_size",
89+
"deacon_abs_threshold",
90+
"deacon_rel_threshold",
8691
}
8792

8893

conf/results.config

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,15 @@ params {
4747

4848
// Assign the above paths to publish directories in processes throughout the pipeline
4949
process {
50+
withName: 'DEACON_DEPLETE' {
51+
publishDir = [
52+
path: { params.preprocess_results + "/00_host_depletion" },
53+
mode: 'copy',
54+
saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
55+
enabled: true
56+
]
57+
}
58+
5059
withName: 'EXTRACT_HUMAN_VIRUS_READS' {
5160
publishDir = [
5261
path: { params.human_virus_reads },

lib/py_nvd/cli/commands/run.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,24 @@ def run(
352352
help="Remove host reads with STAT (requires --sra-human-db; default: follows --preprocess)",
353353
rich_help_panel=PANEL_PREPROCESSING,
354354
),
355+
deacon_index: Path | None = typer.Option(
356+
None,
357+
"--deacon-index",
358+
help="Path to prebuilt deacon index (.idx file)",
359+
rich_help_panel=PANEL_PREPROCESSING,
360+
),
361+
deacon_index_url: str | None = typer.Option(
362+
None,
363+
"--deacon-index-url",
364+
help="URL to download prebuilt deacon index (default: panhuman-1)",
365+
rich_help_panel=PANEL_PREPROCESSING,
366+
),
367+
deacon_contaminants_fasta: Path | None = typer.Option(
368+
None,
369+
"--deacon-contaminants-fasta",
370+
help="Custom contaminant FASTA to union with base index",
371+
rich_help_panel=PANEL_PREPROCESSING,
372+
),
355373
filter_reads: bool | None = typer.Option(
356374
None,
357375
"--filter-reads/--no-filter-reads",
@@ -627,6 +645,9 @@ def run(
627645
"dedup": dedup,
628646
"trim_adapters": trim_adapters,
629647
"scrub_host_reads": scrub_host_reads,
648+
"deacon_index": deacon_index,
649+
"deacon_index_url": deacon_index_url,
650+
"deacon_contaminants_fasta": deacon_contaminants_fasta,
630651
"filter_reads": filter_reads,
631652
"min_read_quality_illumina": min_read_quality_illumina,
632653
"min_read_quality_nanopore": min_read_quality_nanopore,

lib/py_nvd/models.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -978,6 +978,43 @@ class NvdParams(BaseModel):
978978
json_schema_extra={"category": "Preprocessing"},
979979
)
980980

981+
# Host scrubbing with deacon
982+
deacon_index: Path | None = Field(
983+
None,
984+
description="Path to prebuilt deacon index (.idx file)",
985+
json_schema_extra={"category": "Preprocessing"},
986+
)
987+
deacon_index_url: str = Field(
988+
"https://zenodo.org/api/records/17288185/files/panhuman-1.k31w15.idx/content",
989+
description="URL to download prebuilt deacon index (default: panhuman-1)",
990+
json_schema_extra={"category": "Preprocessing"},
991+
)
992+
deacon_contaminants_fasta: Path | None = Field(
993+
None,
994+
description="Custom contaminant FASTA to union with base index",
995+
json_schema_extra={"category": "Preprocessing"},
996+
)
997+
deacon_kmer_size: int = Field(
998+
31,
999+
description="K-mer size for deacon index (must match index if prebuilt)",
1000+
json_schema_extra={"category": "Preprocessing"},
1001+
)
1002+
deacon_window_size: int = Field(
1003+
15,
1004+
description="Minimizer window size for deacon index",
1005+
json_schema_extra={"category": "Preprocessing"},
1006+
)
1007+
deacon_abs_threshold: int = Field(
1008+
2,
1009+
description="Minimum absolute minimizer hits to classify as contaminant",
1010+
json_schema_extra={"category": "Preprocessing"},
1011+
)
1012+
deacon_rel_threshold: float = Field(
1013+
0.01,
1014+
description="Minimum relative proportion of minimizers (0.0-1.0)",
1015+
json_schema_extra={"category": "Preprocessing"},
1016+
)
1017+
9811018
# =========================================================================
9821019
# Analysis Parameters
9831020
# =========================================================================

modules/deacon.nf

Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
/*
2+
* Deacon: Fast alignment-free decontamination
3+
* https://github.com/bede/deacon
4+
*
5+
* Key features:
6+
* - Preserves FASTQ headers (critical for read pairing)
7+
* - Composable indexes via set algebra (union, diff, intersect)
8+
* - SIMD-accelerated, ~5GB RAM for panhuman index
9+
*/
10+
11+
process DEACON_BUILD_INDEX {
    /*
     * Build a deacon index from the supplied FASTA input.
     * Intended for custom contaminant sequences.
     *
     * The k-mer and window sizes come from params.deacon_kmer_size and
     * params.deacon_window_size; both are also encoded in the output
     * filename as <fasta simpleName>.k<K>w<W>.idx.
     */

    label "medium"
    tag "${fasta.simpleName}"

    input:
    path fasta

    output:
    path "*.idx", emit: index

    script:
    def kmer = params.deacon_kmer_size
    def window = params.deacon_window_size
    // deacon writes the index to stdout here, so it is redirected to a file
    // whose name matches the "*.idx" output glob.
    def index_file = "${fasta.simpleName}.k${kmer}w${window}.idx"
    """
    deacon index build \\
        --threads ${task.cpus} \\
        -k ${kmer} \\
        -w ${window} \\
        ${fasta} > ${index_file}
    """
}
36+
37+
process DEACON_FETCH_INDEX {
    /*
     * Download a prebuilt deacon index from a URL.
     *
     * Takes the URL as a channel value so the process only runs when
     * the input channel is non-empty (no `when:` guard needed).
     * Caches in work directory; use storeDir for persistent caching.
     *
     * The local filename is the last URL path segment that ends in `.idx`,
     * rather than simply the final segment. Download endpoints such as
     * `.../files/panhuman-1.k31w15.idx/content` end in `content`, which
     * would never satisfy the `*.idx` output glob. If no segment ends in
     * `.idx`, the suffix is appended to the final segment so the declared
     * output is still produced.
     */

    label "low"

    input:
    val url

    output:
    path "*.idx", emit: index

    script:
    // Ignore any query string or fragment before splitting into path segments.
    def segments = url.toString().split(/[?#]/)[0].tokenize('/')
    // Search from the end so the segment closest to the file wins.
    def filename = segments.reverse().find { it.endsWith('.idx') } ?: "${segments.last()}.idx"
    """
    curl -fsSL "${url}" -o "${filename}"
    """
}
59+
60+
process DEACON_UNION_INDEXES {
    /*
     * Merge several deacon indexes into a single index via set union.
     * Only invoked when both a base index and a custom index are present,
     * so the input is expected to hold at least two .idx files.
     */

    label "low"

    input:
    path indexes // Collection of .idx files (always 2+)

    output:
    path "combined.idx", emit: index

    script:
    // Pass the staged files to deacon by bare filename, space-separated.
    def index_names = indexes.collect { idx -> idx.name }
    def union_args = index_names.join(' ')
    """
    deacon index union ${union_args} > combined.idx
    """
}
80+
81+
process DEACON_DEPLETE {
    /*
     * Strip contaminant reads with `deacon filter` running in deplete mode.
     *
     * Critical: FASTQ headers are preserved verbatim, which repair.sh
     * needs in order to re-pair reads after filtering. SPAdes paired-end
     * assembly depends on proper read pairing.
     *
     * Deacon natively handles gzipped input/output (since v0.13.0).
     * When writing .gz output via --output, deacon splits --threads 1:1
     * between filtering and compression automatically.
     *
     * Emits:
     *   reads - the sample metadata plus <sample_id>.depleted.fastq.gz
     *   stats - <sample_id>.deacon.json, the summary written by deacon
     */

    label "medium"
    tag "${sample_id}"

    // Retry on the first two attempts; after that, ignore the failure so a
    // single sample does not halt the run.
    maxRetries 2
    errorStrategy { task.attempt < 3 ? 'retry' : 'ignore' }

    input:
    tuple val(sample_id), val(platform), val(read_structure), path(reads), path(index)

    output:
    tuple val(sample_id), val(platform), val(read_structure), path("${sample_id}.depleted.fastq.gz"), emit: reads
    tuple val(sample_id), path("${sample_id}.deacon.json"), emit: stats

    script:
    // These names must stay in sync with the output declarations above.
    def summary_json = "${sample_id}.deacon.json"
    def depleted_fastq = "${sample_id}.depleted.fastq.gz"
    """
    deacon filter \\
        --deplete \\
        --threads ${task.cpus} \\
        --abs-threshold ${params.deacon_abs_threshold} \\
        --rel-threshold ${params.deacon_rel_threshold} \\
        --summary ${summary_json} \\
        --output ${depleted_fastq} \\
        ${index} \\
        ${reads}
    """
}

nextflow.config

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,15 @@ params {
100100
min_read_length = 50
101101
max_read_length = null
102102

103+
// Host scrubbing with deacon (used when scrub_host_reads is enabled)
104+
deacon_index = null
105+
deacon_index_url = "https://zenodo.org/api/records/17288185/files/panhuman-1.k31w15.idx/content"
106+
deacon_contaminants_fasta = null
107+
deacon_kmer_size = 31
108+
deacon_window_size = 15
109+
deacon_abs_threshold = 2
110+
deacon_rel_threshold = 0.01
111+
103112
// NVD settings
104113
cutoff_percent = 0.001
105114
entropy = 0.9

0 commit comments

Comments
 (0)