Skip to content

Commit 15e8cde

Browse files
committed
vastly simplifying primer preparation with less heavy-handed handling by nextflow, plus yet more tidying around the codebase
1 parent 79807f0 commit 15e8cde

17 files changed

+1377
-422
lines changed

Containerfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,5 +76,5 @@ COPY Cargo.lock $HOME/Cargo.lock
7676
COPY bin/find_and_trim_amplicons.rs $HOME/bin/find_and_trim_amplicons.rs
7777
RUN cd $HOME && \
7878
export PATH="$HOME/.pixi/envs/default/bin:$PATH" && \
79-
cargo build --release && \
79+
RUSTFLAGS="-C target-cpu=native" cargo build --release && \
8080
cp $HOME/target/release/find_and_trim_amplicons $HOME/.pixi/envs/default/bin/

bin/create_amplicon_tsv.py

Lines changed: 145 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -6,128 +6,200 @@
66
# ]
77
# ///
88

9-
"""Summarize amplicon coverage from stats files and BED file."""
9+
"""
10+
Summarize amplicon coverage from stats files and primer position data.
11+
12+
Supports two input modes for position data:
13+
1. --primer-tsv: Use primer_pairs.tsv from prepare_primers.py (preferred)
14+
2. --bed: Use BED file with primer coordinates (legacy)
15+
16+
The primer TSV approach is cleaner as it already contains pre-computed
17+
amplicon positions and works regardless of primer naming conventions.
18+
"""
1019

1120
from __future__ import annotations
1221

1322
import argparse
23+
import sys
24+
from pathlib import Path
1425

1526
import polars as pl
1627

1728

18-
def main(bed_file: str, output_file: str, stats_pattern: str) -> None:
29+
def _load_positions_from_tsv(tsv_path: str) -> pl.LazyFrame:
30+
"""
31+
Load amplicon positions from primer_pairs.tsv.
32+
33+
The TSV has columns: amplicon_name, fwd_sequence, rev_sequence, chrom,
34+
amplicon_start, amplicon_end
35+
36+
Returns a LazyFrame with: amplicon_name, start_pos, end_pos
37+
"""
38+
return pl.scan_csv(tsv_path, separator="\t").select(
39+
pl.col("amplicon_name"),
40+
pl.col("amplicon_start").alias("start_pos"),
41+
pl.col("amplicon_end").alias("end_pos"),
42+
)
43+
44+
45+
def _load_positions_from_bed(
46+
bed_path: str, fwd_suffix: str, rev_suffix: str
47+
) -> pl.LazyFrame:
1948
"""
20-
Create amplicon summary TSV from stats files and BED file.
49+
Load amplicon positions from BED file (legacy approach).
2150
22-
Reads all stats files matching the glob pattern, extracts sample and amplicon
23-
names from the 'file' column, joins with BED file to get amplicon positions,
24-
and writes a summary TSV.
51+
Extracts base amplicon name from primer names and computes positions
52+
from LEFT/RIGHT primer coordinates.
2553
26-
Parameters:
27-
bed_file: Path to BED file with primer positions
28-
output_file: Path to output TSV file
29-
stats_pattern: Glob pattern for stats files (e.g., "stats_*.tsv")
54+
Returns a LazyFrame with: amplicon_name, start_pos, end_pos
3055
"""
31-
# Read all stats files with glob, extract info from the 'file' column
32-
# (which contains filenames like "SAMPLE.QIAseq_X-Y.no_downsampling.fasta.gz")
33-
stats = (
56+
bed = (
3457
pl.scan_csv(
35-
stats_pattern,
58+
bed_path,
3659
separator="\t",
37-
glob=True,
60+
has_header=False,
61+
new_columns=["chrom", "start", "end", "name", "score", "strand"],
3862
)
3963
.with_columns(
40-
# Normalize .QIAseq_ to _QIAseq_ for consistent parsing
41-
pl.col("file")
42-
.str.replace(r"\.QIAseq_", "_QIAseq_")
43-
.alias("normalized_file")
64+
# Extract base amplicon name by removing suffixes and indices
65+
pl.col("name")
66+
.str.replace(fwd_suffix, "")
67+
.str.replace(rev_suffix, "")
68+
.str.replace(r"_splice\d+$", "")
69+
.str.replace(r"-\d+$", "")
70+
.alias("amplicon_name"),
71+
# Flag primer direction
72+
pl.col("name").str.contains(fwd_suffix).alias("is_fwd"),
73+
pl.col("name").str.contains(rev_suffix).alias("is_rev"),
4474
)
75+
.filter(pl.col("amplicon_name").is_not_null())
76+
)
77+
78+
# Aggregate to get amplicon span
79+
return bed.group_by("amplicon_name").agg(
80+
pl.col("start").filter(pl.col("is_fwd")).min().alias("start_pos"),
81+
pl.col("end").filter(pl.col("is_rev")).max().alias("end_pos"),
82+
)
83+
84+
85+
def _parse_stats_files(stats_pattern: str) -> pl.LazyFrame:
86+
"""
87+
Parse amplicon stats files and extract sample/amplicon info.
88+
89+
Stats files have columns including 'file' and 'num_seqs'.
90+
The 'file' column contains filenames like:
91+
"SAMPLE.amplicon_name.no_downsampling.fasta.gz"
92+
93+
Returns a LazyFrame with: sample_name, amplicon_name, reads
94+
"""
95+
return (
96+
pl.scan_csv(stats_pattern, separator="\t", glob=True)
4597
.with_columns(
46-
# Extract sample name: everything before _QIAseq_, or first dot-segment
47-
pl.when(pl.col("normalized_file").str.contains("_QIAseq_"))
48-
.then(
49-
pl.col("normalized_file").str.extract(
50-
r"^([^_]+(?:_[^_]+)*?)_QIAseq_", group_index=1
51-
)
52-
)
53-
.otherwise(
54-
pl.col("normalized_file").str.extract(r"^([^.]+)", group_index=1)
55-
)
56-
.alias("sample_name"),
57-
# Extract amplicon name: QIAseq_XXX (including any suffix like -1)
98+
# Extract sample name: first segment before '.'
99+
pl.col("file").str.extract(r"^([^.]+)", group_index=1).alias("sample_name"),
100+
# Extract amplicon name: second segment (between first and second '.')
58101
pl.col("file")
59-
.str.extract(r"(QIAseq_[^.]+)", group_index=1)
102+
.str.extract(r"^[^.]+\.([^.]+)", group_index=1)
60103
.alias("amplicon_name"),
61-
# Extract base amplicon for joining: QIAseq_N (just the number, no suffix)
62-
pl.col("file")
63-
.str.extract(r"(QIAseq_\d+)", group_index=1)
64-
.alias("base_amplicon"),
65104
)
66105
.select(
67106
"sample_name",
68107
"amplicon_name",
69-
"base_amplicon",
70108
pl.col("num_seqs").alias("reads"),
71109
)
72110
)
73111

74-
# Read BED file, compute amplicon start/end positions
75-
bed = (
76-
pl.scan_csv(
77-
bed_file,
78-
separator="\t",
79-
has_header=False,
80-
new_columns=["chrom", "start", "end", "name", "score", "strand"],
81-
)
82-
.with_columns(
83-
# Extract base amplicon from primer name (e.g., QIAseq_2_LEFT -> QIAseq_2)
84-
pl.col("name")
85-
.str.extract(r"(QIAseq_\d+)", group_index=1)
86-
.alias("base_amplicon"),
87-
# Flag LEFT vs RIGHT primers
88-
pl.col("name").str.contains("_LEFT").alias("is_left"),
89-
pl.col("name").str.contains("_RIGHT").alias("is_right"),
90-
)
91-
.filter(pl.col("base_amplicon").is_not_null())
92-
)
93-
94-
# Aggregate to get min(start) for LEFT primers, max(end) for RIGHT primers
95-
amplicon_positions = bed.group_by("base_amplicon").agg(
96-
pl.col("start").filter(pl.col("is_left")).min().alias("start_pos"),
97-
pl.col("end").filter(pl.col("is_right")).max().alias("end_pos"),
98-
)
99112

100-
# Join stats with positions and format output
113+
def _create_summary(
114+
stats_lf: pl.LazyFrame,
115+
positions_lf: pl.LazyFrame,
116+
output_path: str,
117+
) -> None:
118+
"""
119+
Join stats with positions and write summary TSV.
120+
"""
101121
result = (
102-
stats.join(amplicon_positions, on="base_amplicon", how="left")
122+
stats_lf.join(positions_lf, on="amplicon_name", how="left")
103123
.select(
104124
"sample_name",
105125
"amplicon_name",
106126
pl.col("start_pos").cast(pl.String).fill_null("NA"),
107127
pl.col("end_pos").cast(pl.String).fill_null("NA"),
108128
"reads",
109129
)
130+
.sort("sample_name", "amplicon_name")
110131
.collect()
111132
)
112133

113-
result.write_csv(output_file, separator="\t")
134+
result.write_csv(output_path, separator="\t")
135+
print(f"Wrote {len(result)} rows to {output_path}", file=sys.stderr)
114136

115137

116-
if __name__ == "__main__":
138+
def main() -> None:
117139
parser = argparse.ArgumentParser(
118-
description="Summarize amplicon coverage from stats files and BED file.",
140+
description=__doc__,
141+
formatter_class=argparse.RawDescriptionHelpFormatter,
119142
)
120-
parser.add_argument("--bed", required=True, help="Path to BED file")
121-
parser.add_argument(
122-
"--output",
123-
default="amplicon_summary.tsv",
124-
help="Output TSV file",
143+
144+
# Position data input (mutually exclusive)
145+
pos_group = parser.add_mutually_exclusive_group(required=True)
146+
pos_group.add_argument(
147+
"--primer-tsv",
148+
type=Path,
149+
help="Primer pairs TSV from prepare_primers.py (preferred)",
125150
)
151+
pos_group.add_argument(
152+
"--bed",
153+
type=Path,
154+
help="BED file with primer coordinates (legacy)",
155+
)
156+
157+
# Stats input
126158
parser.add_argument(
127159
"--pattern",
128160
default="stats_*.tsv",
129161
help="Glob pattern for stats files (default: stats_*.tsv)",
130162
)
131163

164+
# Output
165+
parser.add_argument(
166+
"--output",
167+
default="amplicon_summary.tsv",
168+
help="Output TSV file (default: amplicon_summary.tsv)",
169+
)
170+
171+
# BED-specific options
172+
parser.add_argument(
173+
"--fwd-suffix",
174+
default="_LEFT",
175+
help="Forward primer suffix for BED parsing (default: _LEFT)",
176+
)
177+
parser.add_argument(
178+
"--rev-suffix",
179+
default="_RIGHT",
180+
help="Reverse primer suffix for BED parsing (default: _RIGHT)",
181+
)
182+
132183
args = parser.parse_args()
133-
main(args.bed, args.output, args.pattern)
184+
185+
# Load position data
186+
if args.primer_tsv:
187+
assert args.primer_tsv.is_file(), f"Primer TSV not found: {args.primer_tsv}"
188+
positions_lf = _load_positions_from_tsv(str(args.primer_tsv))
189+
else:
190+
assert args.bed.is_file(), f"BED file not found: {args.bed}"
191+
positions_lf = _load_positions_from_bed(
192+
str(args.bed),
193+
args.fwd_suffix,
194+
args.rev_suffix,
195+
)
196+
197+
# Parse stats files
198+
stats_lf = _parse_stats_files(args.pattern)
199+
200+
# Create and write summary
201+
_create_summary(stats_lf, positions_lf, args.output)
202+
203+
204+
if __name__ == "__main__":
205+
main()

0 commit comments

Comments
 (0)