Skip to content

Commit ece0dc5

Browse files
committed
adding the capacity for decontamination of arbitrary reference datasets with deacon
1 parent 285a2c7 commit ece0dc5

File tree

12 files changed

+484
-274
lines changed

12 files changed

+484
-274
lines changed

.gitignore

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,14 @@
3535
!/conf/*.yml
3636

3737
# bin of executable scripts
38-
!/bin
38+
!/bin/
3939
!/bin/*.py
40+
!/bin/*.R
41+
!/bin/*.r
42+
!/bin/*.pl
43+
!/bin/*.lua
44+
!/bin/*.sh
45+
!/bin/*.awk
4046

4147
# groovy libraries
4248
!/lib

bin/make_primer_patterns.lua

Lines changed: 115 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,115 @@
1+
#!/usr/bin/env lua
2+
3+
-- Usage:
4+
-- lua make_primer_patterns.lua -i input.fasta [-o output_prefix] [-f forward_pattern] [-r reverse_pattern]
5+
6+
local function parse_args()
7+
local opts = {
8+
input_fasta = nil,
9+
output_prefix = "primer_patterns",
10+
forward_pattern = "^(.*?)",
11+
reverse_pattern = "^(.*?)",
12+
}
13+
14+
local i = 1
15+
while i <= #arg do
16+
local a = arg[i]
17+
if a == "-i" or a == "--input_fasta" then
18+
i = i + 1
19+
opts.input_fasta = arg[i]
20+
elseif a == "-o" or a == "--output_prefix" then
21+
i = i + 1
22+
opts.output_prefix = arg[i]
23+
elseif a == "-f" or a == "--forward_pattern" then
24+
i = i + 1
25+
opts.forward_pattern = arg[i]
26+
elseif a == "-r" or a == "--reverse_pattern" then
27+
i = i + 1
28+
opts.reverse_pattern = arg[i]
29+
else
30+
error("Unknown argument: " .. a)
31+
end
32+
i = i + 1
33+
end
34+
35+
assert(opts.input_fasta, "You must provide --input_fasta (-i)")
36+
return opts
37+
end
38+
39+
local function read_fasta_lines(path)
40+
local file, err = io.open(path, "r")
41+
assert(file, "Could not open FASTA file: " .. (err or "unknown error"))
42+
local lines = {}
43+
for line in file:lines() do
44+
lines[#lines + 1] = line:gsub("%s+", "") -- strip whitespace
45+
end
46+
file:close()
47+
return lines
48+
end
49+
50+
local function generate_patterns(fasta_path, label, fwd_prefix, rev_suffix)
51+
local lines = read_fasta_lines(fasta_path)
52+
53+
-- Extract sequences and headers
54+
local seqs, headers, entries = {}, {}, {}
55+
local accumulator = nil
56+
for _, line in ipairs(lines) do
57+
if line:sub(1, 1) == ">" then
58+
if accumulator then
59+
entries[#entries + 1] = accumulator
60+
end
61+
accumulator = { header = line, seq = "" }
62+
else
63+
assert(accumulator)
64+
accumulator.seq = accumulator.seq .. line
65+
end
66+
end
67+
if accumulator then
68+
entries[#entries + 1] = accumulator
69+
end
70+
71+
-- Crash if the number of parsed sequences isn't exactly 2
72+
assert(#seqs == 2)
73+
74+
-- Parse start coordinates from header lines
75+
local starts = {}
76+
for _, header in ipairs(headers) do
77+
local start = header:match(":(%d+)%-%d+")
78+
assert(start, "Invalid header format (expected bedtools-style): " .. header)
79+
starts[#starts + 1] = tonumber(start)
80+
end
81+
82+
-- Heuristic check for orientation assumption
83+
if starts[1] > starts[2] then
84+
io.stderr:write(
85+
"⚠️ Warning: Please double check that the provided FASTA is formatted like an output from `bedtools getfasta`, e.g.\n\n'>PP599462.1:0-16'"
86+
)
87+
end
88+
89+
-- Build patterns
90+
local fwd_pattern = fwd_prefix .. seqs[1]
91+
local rev_pattern = seqs[2] .. rev_suffix
92+
93+
-- Write output
94+
local out, err = io.open(label .. ".txt", "w")
95+
assert(out, "Failed to write output: " .. (err or "unknown error"))
96+
out:write(fwd_pattern .. "\n")
97+
out:write(rev_pattern .. "\n")
98+
end
99+
100+
local function main()
101+
local opts = parse_args()
102+
103+
-- make sure the provided file exists
104+
local file = io.open(opts.input_fasta, "r")
105+
assert(file, "Input FASTA file does not exist: " .. opts.input_fasta)
106+
file:close()
107+
108+
-- generate the patterns and write them to a text file
109+
generate_patterns(opts.input_fasta, opts.output_prefix, opts.forward_pattern, opts.reverse_pattern)
110+
end
111+
112+
-- Run main
113+
if debug.getinfo(1, "S").short_src == arg[0] then
114+
main()
115+
end

bin/split_primer_combos.lua

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
#!/usr/bin/env lua
2+
3+
-- Usage: lua split_primers.lua input.bed _LEFT _RIGHT
4+
-- Default suffixes if not provided
5+
local input_bed = arg[1] -- remember lua is 1-based!
6+
local forward_suffix = arg[2] or "_LEFT"
7+
local reverse_suffix = arg[3] or "_RIGHT"
8+
9+
-- Make sure the input bed file is provided
10+
assert(input_bed, "Usage: lua split_primers.lua <input.bed> [_LEFT] [_RIGHT]")
11+
12+
-- Initialize a table to store each primer
13+
local primers = {}
14+
15+
-- Read BED line by line
16+
local file = io.open(input_bed, "r")
17+
assert(file, "Failed to open input BED file.")
18+
19+
for line in file:lines() do
20+
local ref, start_pos, stop_pos, name, index, sense =
21+
line:match("([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)\t([^\t]+)")
22+
assert(name, "Malformed BED line: " .. line)
23+
24+
local base_name = name
25+
base_name = base_name:gsub(forward_suffix, "")
26+
base_name = base_name:gsub(reverse_suffix, "")
27+
28+
primers[base_name] = primers[base_name] or {}
29+
table.insert(primers[base_name], line)
30+
end
31+
32+
file:close()
33+
34+
-- Write each group to its own BED file
35+
for base_name, records in pairs(primers) do
36+
assert(#records == 2, "Expected 2 records for " .. base_name .. ", found " .. #records)
37+
local out = assert(io.open(base_name .. ".bed", "w"))
38+
for _, rec in ipairs(records) do
39+
out:write(rec .. "\n")
40+
end
41+
out:close()
42+
end

main.nf

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,10 @@ workflow {
6363
Channel.fromPath( params.ref_gbk ) :
6464
Channel.empty()
6565

66+
ch_contam_fasta = params.contam_fasta && file(params.contam_fasta).isFile()
67+
? Channel.fromPath( params.contam_fasta )
68+
: Channel.empty()
69+
6670
ch_snpeff_config = params.snpEff_config ?
6771
Channel.fromPath( params.snpEff_config ) :
6872
Channel.empty()
@@ -74,6 +78,7 @@ workflow {
7478
ch_primer_bed,
7579
ch_refseq,
7680
ch_ref_gbk,
81+
ch_contam_fasta,
7782
ch_snpeff_config,
7883
)
7984

modules/deacon.nf

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
process INDEX_CONTAMINANTS {
2+
3+
/* */
4+
5+
errorStrategy { task.attempt < 3 ? 'retry' : 'ignore' }
6+
maxRetries 2
7+
8+
cpus 1
9+
10+
input:
11+
path contam_fasta
12+
13+
output:
14+
path "*.idx"
15+
16+
script:
17+
def dbName = file(contam_fasta).getSimpleName()
18+
"""
19+
deacon index build ${contam_fasta} > ${dbName}.idx
20+
"""
21+
22+
}
23+
24+
process DECONTAMINATE {
25+
26+
/* */
27+
28+
tag "${barcode}"
29+
30+
errorStrategy { task.attempt < 3 ? 'retry' : 'ignore' }
31+
maxRetries 2
32+
33+
cpus 1
34+
35+
input:
36+
tuple val(barcode), path(reads), path(contam_index)
37+
38+
output:
39+
tuple val(barcode), path("${barcode}.decontam.fastq.gz")
40+
41+
script:
42+
def dbName = file(contam_index).getSimpleName()
43+
"""
44+
zcat ${reads} \
45+
| deacon filter ${contam_index} --log ${dbName}.log.json \
46+
| pigz > ${barcode}.decontam.fastq.gz
47+
"""
48+
49+
}

modules/fastqc.nf

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ process FASTQC {
1515
output:
1616
path "*.html", emit: html
1717
path "*.zip", emit: zip
18+
tuple val(barcode), path(reads), emit: fastq
1819

1920
script:
2021
"""
@@ -49,4 +50,4 @@ process FASTQC_RS {
4950
mv fastqc_data.txt ${label}/fastqc_data.txt
5051
"""
5152

52-
}
53+
}

nextflow.config

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,9 @@ params {
139139
// nextclade dataset
140140
nextclade_dataset = null
141141

142+
// Contamination FASTA dataset to scrub from reads
143+
contam_fasta = null
144+
142145
// devider haplotype phasing preset
143146
devider_preset = "nanopore-r10" // old-long-reads, nanopore-r9, nanopore-r10, hi-fi
144147

0 commit comments

Comments
 (0)