dholab
diff --git a/‎.github/scripts/validate_schema_completeness.py‎
Lines changed: 5 additions & 0 deletions b/‎.github/scripts/validate_schema_completeness.py‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎conf/results.config‎
Lines changed: 9 additions & 0 deletions b/‎conf/results.config‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎lib/py_nvd/cli/commands/run.py‎
Lines changed: 21 additions & 0 deletions b/‎lib/py_nvd/cli/commands/run.py‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎lib/py_nvd/models.py‎
Lines changed: 37 additions & 0 deletions b/‎lib/py_nvd/models.py‎
Lines changed: 37 additions & 0 deletions
diff --git a/‎modules/deacon.nf‎
Lines changed: 119 additions & 0 deletions b/‎modules/deacon.nf‎
Lines changed: 119 additions & 0 deletions
diff --git a/‎nextflow.config‎
Lines changed: 9 additions & 0 deletions b/‎nextflow.config‎
Lines changed: 9 additions & 0 deletions
@@ -83,6 +83,11 @@
     "state_dir",
     # Exposed via negated CLI flag --no-slack
     "slack_enabled",
+    # Deacon tuning (set via params-file or preset)
+    "deacon_kmer_size",
+    "deacon_window_size",
+    "deacon_abs_threshold",
+    "deacon_rel_threshold",
 }
 
 
 
@@ -47,6 +47,15 @@ params {
 
 // Assign the above paths to publish directories in processes throughout the pipeline
 process {
+    // Publish DEACON_DEPLETE outputs by copy into the "00_host_depletion"
+    // subdirectory of params.preprocess_results. The saveAs closure returns
+    // null for 'versions.yml', so a file with that name is never published.
+    withName: 'DEACON_DEPLETE' {
+        publishDir = [
+            path: { params.preprocess_results + "/00_host_depletion" },
+            mode: 'copy',
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+            enabled: true
+        ]
+    }
+
     withName: 'EXTRACT_HUMAN_VIRUS_READS' {
         publishDir = [
             path: { params.human_virus_reads },
 
@@ -352,6 +352,24 @@ def run(
         help="Remove host reads with STAT (requires --sra-human-db; default: follows --preprocess)",
         rich_help_panel=PANEL_PREPROCESSING,
     ),
+    deacon_index: Path | None = typer.Option(
+        None,
+        "--deacon-index",
+        help="Path to prebuilt deacon index (.idx file)",
+        rich_help_panel=PANEL_PREPROCESSING,
+    ),
+    deacon_index_url: str | None = typer.Option(
+        None,
+        "--deacon-index-url",
+        help="URL to download prebuilt deacon index (default: panhuman-1)",
+        rich_help_panel=PANEL_PREPROCESSING,
+    ),
+    deacon_contaminants_fasta: Path | None = typer.Option(
+        None,
+        "--deacon-contaminants-fasta",
+        help="Custom contaminant FASTA to union with base index",
+        rich_help_panel=PANEL_PREPROCESSING,
+    ),
     filter_reads: bool | None = typer.Option(
         None,
         "--filter-reads/--no-filter-reads",
@@ -627,6 +645,9 @@ def run(
         "dedup": dedup,
         "trim_adapters": trim_adapters,
         "scrub_host_reads": scrub_host_reads,
+        "deacon_index": deacon_index,
+        "deacon_index_url": deacon_index_url,
+        "deacon_contaminants_fasta": deacon_contaminants_fasta,
         "filter_reads": filter_reads,
         "min_read_quality_illumina": min_read_quality_illumina,
         "min_read_quality_nanopore": min_read_quality_nanopore,
 
@@ -978,6 +978,43 @@ class NvdParams(BaseModel):
         json_schema_extra={"category": "Preprocessing"},
     )
 
+    # Host scrubbing with deacon
+    deacon_index: Path | None = Field(
+        None,
+        description="Path to prebuilt deacon index (.idx file)",
+        json_schema_extra={"category": "Preprocessing"},
+    )
+    deacon_index_url: str = Field(
+        "https://zenodo.org/api/records/17288185/files/panhuman-1.k31w15.idx/content",
+        description="URL to download prebuilt deacon index (default: panhuman-1)",
+        json_schema_extra={"category": "Preprocessing"},
+    )
+    deacon_contaminants_fasta: Path | None = Field(
+        None,
+        description="Custom contaminant FASTA to union with base index",
+        json_schema_extra={"category": "Preprocessing"},
+    )
+    deacon_kmer_size: int = Field(
+        31,
+        description="K-mer size for deacon index (must match index if prebuilt)",
+        json_schema_extra={"category": "Preprocessing"},
+    )
+    deacon_window_size: int = Field(
+        15,
+        description="Minimizer window size for deacon index",
+        json_schema_extra={"category": "Preprocessing"},
+    )
+    deacon_abs_threshold: int = Field(
+        2,
+        description="Minimum absolute minimizer hits to classify as contaminant",
+        json_schema_extra={"category": "Preprocessing"},
+    )
+    deacon_rel_threshold: float = Field(
+        0.01,
+        description="Minimum relative proportion of minimizers (0.0-1.0)",
+        json_schema_extra={"category": "Preprocessing"},
+    )
+
     # =========================================================================
     # Analysis Parameters
     # =========================================================================
 
@@ -0,0 +1,119 @@
+/*
+ * Deacon: Fast alignment-free decontamination
+ * https://github.com/bede/deacon
+ *
+ * Key features:
+ * - Preserves FASTQ headers (critical for read pairing)
+ * - Composable indexes via set algebra (union, diff, intersect)
+ * - SIMD-accelerated, ~5GB RAM for panhuman index
+ */
+
+process DEACON_BUILD_INDEX {
+    /*
+     * Run `deacon index build` on the given FASTA input.
+     * Intended for custom contaminant sequences.
+     *
+     * The index is written to <fasta simpleName>.k<K>w<W>.idx, where K and W
+     * are params.deacon_kmer_size and params.deacon_window_size.
+     */
+
+    tag "${fasta.simpleName}"
+    label "medium"
+
+    input:
+    path fasta
+
+    output:
+    path "*.idx", emit: index
+
+    script:
+    def kmer = params.deacon_kmer_size
+    def window = params.deacon_window_size
+    def index_name = "${fasta.simpleName}.k${kmer}w${window}.idx"
+    """
+    deacon index build \\
+        --threads ${task.cpus} \\
+        -k ${kmer} \\
+        -w ${window} \\
+        ${fasta} > ${index_name}
+    """
+}
+
+process DEACON_FETCH_INDEX {
+    /*
+     * Download a prebuilt deacon index from URL.
+     * Takes the URL as a channel value so the process only runs when
+     * the input channel is non-empty (no `when:` guard needed).
+     * Caches in work directory; use storeDir for persistent caching.
+     *
+     * Input:  url   - address of the index to download
+     * Output: index - the downloaded file, always named with a .idx suffix
+     */
+
+    label "low"
+
+    input:
+    val url
+
+    output:
+    path "*.idx", emit: index
+
+    script:
+    // The last path segment is not always the file name: the default Zenodo
+    // URL ends in ".../panhuman-1.k31w15.idx/content", which would be saved
+    // as "content" and never match the "*.idx" output glob. Drop any query
+    // string or fragment, take the last segment ending in ".idx", and fall
+    // back to a fixed name so the declared output is always produced.
+    def segments = url.toString().split(/[?#]/)[0].tokenize('/')
+    def filename = segments.reverse().find { segment -> segment.endsWith('.idx') } ?: 'deacon_index.idx'
+    """
+    curl -fsSL "${url}" -o "${filename}"
+    """
+}
+
+process DEACON_UNION_INDEXES {
+    /*
+     * Merge several deacon indexes into one with `deacon index union`.
+     * Only called when both a base index and custom index are present.
+     *
+     * The staged file names are passed to the command separated by spaces
+     * and the result is written to combined.idx.
+     */
+
+    label "low"
+
+    input:
+    path indexes  // Collection of .idx files (always 2+)
+
+    output:
+    path "combined.idx", emit: index
+
+    script:
+    def index_names = indexes.collect { idx -> idx.name }
+    def union_args = index_names.join(' ')
+    """
+    deacon index union ${union_args} > combined.idx
+    """
+}
+
+process DEACON_DEPLETE {
+    /*
+     * Deplete contaminant reads with `deacon filter --deplete`.
+     *
+     * Critical: This preserves FASTQ headers verbatim, which is required
+     * for repair.sh to re-pair reads after filtering. SPAdes paired-end
+     * assembly depends on proper read pairing.
+     *
+     * Deacon natively handles gzipped input/output (since v0.13.0).
+     * When writing .gz output via --output, deacon splits --threads 1:1
+     * between filtering and compression automatically.
+     *
+     * Outputs:
+     *   reads - <sample_id>.depleted.fastq.gz, with the sample metadata
+     *   stats - <sample_id>.deacon.json summary written by --summary
+     */
+
+    tag "${sample_id}"
+    label "medium"
+
+    // Retry on the first two attempts; after that, ignore the failure.
+    errorStrategy { task.attempt < 3 ? 'retry' : 'ignore' }
+    maxRetries 2
+
+    input:
+    tuple val(sample_id), val(platform), val(read_structure), path(reads), path(index)
+
+    output:
+    tuple val(sample_id), val(platform), val(read_structure), path("${sample_id}.depleted.fastq.gz"), emit: reads
+    tuple val(sample_id), path("${sample_id}.deacon.json"), emit: stats
+
+    script:
+    def summary_json = "${sample_id}.deacon.json"
+    def depleted_fastq = "${sample_id}.depleted.fastq.gz"
+    """
+    deacon filter \\
+        --deplete \\
+        --threads ${task.cpus} \\
+        --abs-threshold ${params.deacon_abs_threshold} \\
+        --rel-threshold ${params.deacon_rel_threshold} \\
+        --summary ${summary_json} \\
+        --output ${depleted_fastq} \\
+        ${index} \\
+        ${reads}
+    """
+}
@@ -100,6 +100,15 @@ params {
     min_read_length           = 50
     max_read_length           = null
 
+    // Host scrubbing with deacon (used when scrub_host_reads is enabled)
+    deacon_index              = null
+    deacon_index_url          = "https://zenodo.org/api/records/17288185/files/panhuman-1.k31w15.idx/content"
+    deacon_contaminants_fasta = null
+    deacon_kmer_size          = 31
+    deacon_window_size        = 15
+    deacon_abs_threshold      = 2
+    deacon_rel_threshold      = 0.01
+
     // NVD settings
     cutoff_percent            = 0.001
     entropy                   = 0.9