|
6 | 6 | # ] |
7 | 7 | # /// |
8 | 8 |
|
9 | | -"""Summarize amplicon coverage from stats files and BED file.""" |
| 9 | +""" |
| 10 | +Summarize amplicon coverage from stats files and primer position data. |
| 11 | +
|
| 12 | +Supports two input modes for position data: |
| 13 | + 1. --primer-tsv: Use primer_pairs.tsv from prepare_primers.py (preferred) |
| 14 | + 2. --bed: Use BED file with primer coordinates (legacy) |
| 15 | +
|
| 16 | +The primer TSV approach is cleaner as it already contains pre-computed |
| 17 | +amplicon positions and works regardless of primer naming conventions. |
| 18 | +""" |
10 | 19 |
|
11 | 20 | from __future__ import annotations |
12 | 21 |
|
13 | 22 | import argparse |
| 23 | +import sys |
| 24 | +from pathlib import Path |
14 | 25 |
|
15 | 26 | import polars as pl |
16 | 27 |
|
17 | 28 |
|
18 | | -def main(bed_file: str, output_file: str, stats_pattern: str) -> None: |
def _load_positions_from_tsv(tsv_path: str) -> pl.LazyFrame:
    """
    Read amplicon positions from a primer_pairs.tsv produced by prepare_primers.py.

    The TSV is expected to contain the columns: amplicon_name, fwd_sequence,
    rev_sequence, chrom, amplicon_start, amplicon_end.

    Returns a LazyFrame with columns: amplicon_name, start_pos, end_pos.
    """
    lf = pl.scan_csv(tsv_path, separator="\t")
    return lf.select(
        "amplicon_name",
        pl.col("amplicon_start").alias("start_pos"),
        pl.col("amplicon_end").alias("end_pos"),
    )
| 43 | + |
| 44 | + |
def _load_positions_from_bed(
    bed_path: str, fwd_suffix: str, rev_suffix: str
) -> pl.LazyFrame:
    """
    Load amplicon positions from BED file (legacy approach).

    Extracts the base amplicon name from primer names by stripping the
    forward/reverse suffixes plus any trailing ``_spliceN`` / ``-N`` index
    markers, then computes each amplicon's span from the forward/reverse
    primer coordinates.

    Parameters:
        bed_path: Path to a headerless 6-column BED file
        fwd_suffix: Substring marking forward primers (e.g. "_LEFT")
        rev_suffix: Substring marking reverse primers (e.g. "_RIGHT")

    Returns a LazyFrame with: amplicon_name, start_pos, end_pos
    """
    bed = (
        pl.scan_csv(
            bed_path,
            separator="\t",
            has_header=False,
            new_columns=["chrom", "start", "end", "name", "score", "strand"],
        )
        .with_columns(
            # Strip the user-supplied suffixes as LITERAL substrings
            # (literal=True): these come from --fwd-suffix/--rev-suffix, and
            # treating them as regex (the polars default) would corrupt names
            # whenever a suffix contains regex metacharacters. The splice and
            # index markers below are intentionally regex (anchored at end).
            pl.col("name")
            .str.replace(fwd_suffix, "", literal=True)
            .str.replace(rev_suffix, "", literal=True)
            .str.replace(r"_splice\d+$", "")
            .str.replace(r"-\d+$", "")
            .alias("amplicon_name"),
            # Flag primer direction via literal substring match (same reason).
            pl.col("name").str.contains(fwd_suffix, literal=True).alias("is_fwd"),
            pl.col("name").str.contains(rev_suffix, literal=True).alias("is_rev"),
        )
        .filter(pl.col("amplicon_name").is_not_null())
    )

    # Amplicon span: leftmost forward-primer start to rightmost
    # reverse-primer end, per amplicon.
    return bed.group_by("amplicon_name").agg(
        pl.col("start").filter(pl.col("is_fwd")).min().alias("start_pos"),
        pl.col("end").filter(pl.col("is_rev")).max().alias("end_pos"),
    )
| 83 | + |
| 84 | + |
def _parse_stats_files(stats_pattern: str) -> pl.LazyFrame:
    """
    Parse all amplicon stats files matched by a glob pattern.

    Each stats file contains (at least) 'file' and 'num_seqs' columns, where
    the 'file' values look like:
        "SAMPLE.amplicon_name.no_downsampling.fasta.gz"

    Returns a LazyFrame with: sample_name, amplicon_name, reads
    """
    # The filename is dot-separated: sample is the first segment, amplicon
    # the second. Build the extraction expressions up front for readability.
    sample_expr = (
        pl.col("file").str.extract(r"^([^.]+)", group_index=1).alias("sample_name")
    )
    amplicon_expr = (
        pl.col("file")
        .str.extract(r"^[^.]+\.([^.]+)", group_index=1)
        .alias("amplicon_name")
    )

    scanned = pl.scan_csv(stats_pattern, separator="\t", glob=True)
    return scanned.with_columns(sample_expr, amplicon_expr).select(
        "sample_name",
        "amplicon_name",
        pl.col("num_seqs").alias("reads"),
    )
73 | 111 |
|
74 | | - # Read BED file, compute amplicon start/end positions |
75 | | - bed = ( |
76 | | - pl.scan_csv( |
77 | | - bed_file, |
78 | | - separator="\t", |
79 | | - has_header=False, |
80 | | - new_columns=["chrom", "start", "end", "name", "score", "strand"], |
81 | | - ) |
82 | | - .with_columns( |
83 | | - # Extract base amplicon from primer name (e.g., QIAseq_2_LEFT -> QIAseq_2) |
84 | | - pl.col("name") |
85 | | - .str.extract(r"(QIAseq_\d+)", group_index=1) |
86 | | - .alias("base_amplicon"), |
87 | | - # Flag LEFT vs RIGHT primers |
88 | | - pl.col("name").str.contains("_LEFT").alias("is_left"), |
89 | | - pl.col("name").str.contains("_RIGHT").alias("is_right"), |
90 | | - ) |
91 | | - .filter(pl.col("base_amplicon").is_not_null()) |
92 | | - ) |
93 | | - |
94 | | - # Aggregate to get min(start) for LEFT primers, max(end) for RIGHT primers |
95 | | - amplicon_positions = bed.group_by("base_amplicon").agg( |
96 | | - pl.col("start").filter(pl.col("is_left")).min().alias("start_pos"), |
97 | | - pl.col("end").filter(pl.col("is_right")).max().alias("end_pos"), |
98 | | - ) |
99 | 112 |
|
100 | | - # Join stats with positions and format output |
def _create_summary(
    stats_lf: pl.LazyFrame,
    positions_lf: pl.LazyFrame,
    output_path: str,
) -> None:
    """
    Left-join stats with amplicon positions and write the summary TSV.

    Amplicons without position data receive "NA" in the position columns.
    Rows are sorted by sample then amplicon for a stable, diff-friendly output.
    """
    joined = stats_lf.join(positions_lf, on="amplicon_name", how="left")
    summary = joined.select(
        "sample_name",
        "amplicon_name",
        pl.col("start_pos").cast(pl.String).fill_null("NA"),
        pl.col("end_pos").cast(pl.String).fill_null("NA"),
        "reads",
    ).sort("sample_name", "amplicon_name")

    table = summary.collect()
    table.write_csv(output_path, separator="\t")
    print(f"Wrote {len(table)} rows to {output_path}", file=sys.stderr)
114 | 136 |
|
115 | 137 |
|
116 | | -if __name__ == "__main__": |
def main() -> None:
    """
    CLI entry point: parse arguments, load position data, and write the summary.

    Position data comes from exactly one of --primer-tsv (preferred) or
    --bed (legacy); stats files are discovered via the --pattern glob.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )

    # Position data input (mutually exclusive)
    pos_group = parser.add_mutually_exclusive_group(required=True)
    pos_group.add_argument(
        "--primer-tsv",
        type=Path,
        help="Primer pairs TSV from prepare_primers.py (preferred)",
    )
    pos_group.add_argument(
        "--bed",
        type=Path,
        help="BED file with primer coordinates (legacy)",
    )

    # Stats input
    parser.add_argument(
        "--pattern",
        default="stats_*.tsv",
        help="Glob pattern for stats files (default: stats_*.tsv)",
    )

    # Output
    parser.add_argument(
        "--output",
        default="amplicon_summary.tsv",
        help="Output TSV file (default: amplicon_summary.tsv)",
    )

    # BED-specific options
    parser.add_argument(
        "--fwd-suffix",
        default="_LEFT",
        help="Forward primer suffix for BED parsing (default: _LEFT)",
    )
    parser.add_argument(
        "--rev-suffix",
        default="_RIGHT",
        help="Reverse primer suffix for BED parsing (default: _RIGHT)",
    )

    args = parser.parse_args()

    # Load position data. Validate with parser.error rather than assert:
    # asserts are stripped under `python -O`, and parser.error produces a
    # proper usage message and non-zero exit code.
    if args.primer_tsv:
        if not args.primer_tsv.is_file():
            parser.error(f"Primer TSV not found: {args.primer_tsv}")
        positions_lf = _load_positions_from_tsv(str(args.primer_tsv))
    else:
        if not args.bed.is_file():
            parser.error(f"BED file not found: {args.bed}")
        positions_lf = _load_positions_from_bed(
            str(args.bed),
            args.fwd_suffix,
            args.rev_suffix,
        )

    # Parse stats files
    stats_lf = _parse_stats_files(args.pattern)

    # Create and write summary
    _create_summary(stats_lf, positions_lf, args.output)
| 202 | + |
| 203 | + |
# Script entry point: run the CLI only when executed directly (not on import).
if __name__ == "__main__":
    main()
0 commit comments