Skip to content

Commit 036cfe7

Browse files
committed
widespread updates across the codebase to reduce dependencies and support the new primer handling core
1 parent b17a221 commit 036cfe7

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

58 files changed

+3147
-2915
lines changed

.github/workflows/docker-image.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,9 @@ on:
99
- 'Containerfile'
1010
- 'pyproject.toml'
1111
- 'pixi.lock'
12+
- 'Cargo.toml'
13+
- 'Cargo.lock'
14+
- 'bin/find_and_trim_amplicons.rs'
1215
pull_request:
1316
branches: [ "main" ]
1417
paths:
@@ -17,6 +20,9 @@ on:
1720
- 'Containerfile'
1821
- 'pyproject.toml'
1922
- 'pixi.lock'
23+
- 'Cargo.toml'
24+
- 'Cargo.lock'
25+
- 'bin/find_and_trim_amplicons.rs'
2026

2127
jobs:
2228

Cargo.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,8 @@ flate2 = "1.0"
3636

3737
# Note: Individual scripts may have additional dependencies
3838
# specified in their inline cargo metadata blocks
39+
40+
[profile.release]
41+
lto = true
42+
codegen-units = 1
43+
panic = "abort"

Containerfile

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,3 +64,13 @@ RUN echo "export PATH=$PATH:$HOME/.pixi/envs/default/bin" >> $HOME/.bashrc
6464
# 6) modify some nextflow environment variables
6565
RUN echo "export NXF_CACHE_DIR=/scratch" >> $HOME/.bashrc
6666
RUN echo "export NXF_HOME=/scratch" >> $HOME/.bashrc
67+
68+
# Pre-compile Rust scripts for container use
69+
# --------------------------------
70+
# This avoids rust-script JIT compilation overhead on every process invocation.
71+
# Local (non-container) runs still use rust-script directly via the .rs files.
72+
COPY Cargo.toml $HOME/Cargo.toml
73+
COPY Cargo.lock $HOME/Cargo.lock
74+
COPY bin/find_and_trim_amplicons.rs $HOME/bin/find_and_trim_amplicons.rs
75+
RUN cd $HOME && $HOME/.pixi/envs/default/bin/cargo build --release && \
76+
cp $HOME/target/release/find_and_trim_amplicons $HOME/.pixi/envs/default/bin/

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ Most users should configure `oneroof` through the command line via the following
9090
| `--max_len` | 12345678910 | The maximum acceptable length for a given read. |
9191
| `--min_len` | 1 | The minimum acceptable length for a given read. |
9292
| `--min_qual` | 0 | The minimum acceptable average quality for a given read. |
93-
| `--max_mismatch` | 0 | The maximum number of mismatches to allow when finding primers |
93+
| `--max_mismatch` | 0 (illumina) or 2 (nanopore) | The maximum number of mismatches to allow when finding primers |
9494
| `--downsample_to` | 0 | Desired coverage to downsample to, with a special value of 0 to indicate 0 downsampling |
9595
| `--secondary` | None | Whether to turn on secondary alignments for each amplicon. Secondary alignments can increase depth at the cost of more reads potentially mapping to the wrong locations. By default, secondary alignments are off. |
9696
| `--min_consensus_freq` | 0.5 | The minimum required frequency of a variant base to be included in a consensus sequence. |

_quarto.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ project:
44
render:
55
- "docs/*.qmd"
66
post-render:
7-
- docs/fix-index-paths.lua
7+
- docs/fix-index-paths.py
88

99
website:
1010
title: "OneRoof Documentation"

bin/create_amplicon_tsv.py

Lines changed: 103 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -1,81 +1,116 @@
1-
#!/usr/bin/env python
1+
#!/usr/bin/env python3
2+
# /// script
3+
# requires-python = ">=3.11"
4+
# dependencies = [
5+
# "polars",
6+
# ]
7+
# ///
28

3-
import argparse
4-
import glob
5-
import re
6-
7-
import pandas as pd
8-
9-
10-
def extract_sample_name(filename):
11-
base_name = filename.replace(".QIAseq_", "_QIAseq_")
12-
if "_QIAseq_" in base_name:
13-
return base_name.split("_QIAseq_")[0]
14-
return base_name.split(".")[0]
15-
16-
17-
def extract_amplicon_name(filename):
18-
match = re.search(r"(QIAseq_[^._\s]+)", filename)
19-
if match:
20-
return match.group(1)
21-
return "unknown"
9+
"""Summarize amplicon coverage from stats files and BED file."""
2210

11+
from __future__ import annotations
2312

24-
def get_amplicon_positions(bed_df, amplicon_name):
25-
# Match base form (e.g., QIAseq_92 from QIAseq_92_splice9)
26-
base_amplicon = re.sub(r"[_-](splice\d+|\d+)$", "", amplicon_name)
27-
28-
# Find matching LEFT and RIGHT primers (include splices)
29-
left_primers = bed_df[
30-
bed_df["name"].str.contains(f"{base_amplicon}_LEFT", na=False)
31-
]
32-
right_primers = bed_df[
33-
bed_df["name"].str.contains(f"{base_amplicon}_RIGHT", na=False)
34-
]
35-
36-
if not left_primers.empty and not right_primers.empty:
37-
start_pos = left_primers["start"].min()
38-
end_pos = right_primers["end"].max()
39-
return start_pos, end_pos
40-
41-
return None, None
42-
13+
import argparse
4314

44-
def main(bed_file, output_file, stats_pattern):
45-
# Read stats files
46-
stats_files = glob.glob(stats_pattern)
47-
if not stats_files:
48-
raise FileNotFoundError(
49-
f"No stats files found matching pattern: {stats_pattern}",
15+
import polars as pl
16+
17+
18+
def main(bed_file: str, output_file: str, stats_pattern: str) -> None:
19+
"""
20+
Create amplicon summary TSV from stats files and BED file.
21+
22+
Reads all stats files matching the glob pattern, extracts sample and amplicon
23+
names from the 'file' column, joins with BED file to get amplicon positions,
24+
and writes a summary TSV.
25+
26+
Parameters:
27+
bed_file: Path to BED file with primer positions
28+
output_file: Path to output TSV file
29+
stats_pattern: Glob pattern for stats files (e.g., "stats_*.tsv")
30+
"""
31+
# Read all stats files with glob, extract info from the 'file' column
32+
# (which contains filenames like "SAMPLE.QIAseq_X-Y.no_downsampling.fasta.gz")
33+
stats = (
34+
pl.scan_csv(
35+
stats_pattern,
36+
separator="\t",
37+
glob=True,
5038
)
39+
.with_columns(
40+
# Normalize .QIAseq_ to _QIAseq_ for consistent parsing
41+
pl.col("file")
42+
.str.replace(r"\.QIAseq_", "_QIAseq_")
43+
.alias("normalized_file")
44+
)
45+
.with_columns(
46+
# Extract sample name: everything before _QIAseq_, or first dot-segment
47+
pl.when(pl.col("normalized_file").str.contains("_QIAseq_"))
48+
.then(
49+
pl.col("normalized_file").str.extract(
50+
r"^([^_]+(?:_[^_]+)*?)_QIAseq_", group_index=1
51+
)
52+
)
53+
.otherwise(
54+
pl.col("normalized_file").str.extract(r"^([^.]+)", group_index=1)
55+
)
56+
.alias("sample_name"),
57+
# Extract amplicon name: QIAseq_XXX (including any suffix like -1)
58+
pl.col("file")
59+
.str.extract(r"(QIAseq_[^.]+)", group_index=1)
60+
.alias("amplicon_name"),
61+
# Extract base amplicon for joining: QIAseq_N (just the number, no suffix)
62+
pl.col("file")
63+
.str.extract(r"(QIAseq_\d+)", group_index=1)
64+
.alias("base_amplicon"),
65+
)
66+
.select(
67+
"sample_name",
68+
"amplicon_name",
69+
"base_amplicon",
70+
pl.col("num_seqs").alias("reads"),
71+
)
72+
)
5173

52-
all_stats_data = [pd.read_csv(f, sep="\t") for f in stats_files]
53-
combined_stats_df = pd.concat(all_stats_data, ignore_index=True)
54-
55-
# Read BED file
56-
bed_columns = ["chrom", "start", "end", "name", "score", "strand"]
57-
bed_df = pd.read_csv(bed_file, sep="\t", names=bed_columns, header=None)
74+
# Read BED file, compute amplicon start/end positions
75+
bed = (
76+
pl.scan_csv(
77+
bed_file,
78+
separator="\t",
79+
has_header=False,
80+
new_columns=["chrom", "start", "end", "name", "score", "strand"],
81+
)
82+
.with_columns(
83+
# Extract base amplicon from primer name (e.g., QIAseq_2_LEFT -> QIAseq_2)
84+
pl.col("name")
85+
.str.extract(r"(QIAseq_\d+)", group_index=1)
86+
.alias("base_amplicon"),
87+
# Flag LEFT vs RIGHT primers
88+
pl.col("name").str.contains("_LEFT").alias("is_left"),
89+
pl.col("name").str.contains("_RIGHT").alias("is_right"),
90+
)
91+
.filter(pl.col("base_amplicon").is_not_null())
92+
)
5893

59-
output_data = []
60-
for _, row in combined_stats_df.iterrows():
61-
filename = row["file"]
62-
sample_name = extract_sample_name(filename)
63-
amplicon_name = extract_amplicon_name(filename)
64-
reads = row["num_seqs"]
65-
start_pos, end_pos = get_amplicon_positions(bed_df, amplicon_name)
94+
# Aggregate to get min(start) for LEFT primers, max(end) for RIGHT primers
95+
amplicon_positions = bed.group_by("base_amplicon").agg(
96+
pl.col("start").filter(pl.col("is_left")).min().alias("start_pos"),
97+
pl.col("end").filter(pl.col("is_right")).max().alias("end_pos"),
98+
)
6699

67-
output_data.append(
68-
{
69-
"sample_name": sample_name,
70-
"amplicon_name": amplicon_name,
71-
"start_pos": start_pos if start_pos is not None else "NA",
72-
"end_pos": end_pos if end_pos is not None else "NA",
73-
"reads": reads,
74-
},
100+
# Join stats with positions and format output
101+
result = (
102+
stats.join(amplicon_positions, on="base_amplicon", how="left")
103+
.select(
104+
"sample_name",
105+
"amplicon_name",
106+
pl.col("start_pos").cast(pl.String).fill_null("NA"),
107+
pl.col("end_pos").cast(pl.String).fill_null("NA"),
108+
"reads",
75109
)
110+
.collect()
111+
)
76112

77-
output_df = pd.DataFrame(output_data)
78-
output_df.to_csv(output_file, sep="\t", index=False)
113+
result.write_csv(output_file, separator="\t")
79114

80115

81116
if __name__ == "__main__":

0 commit comments

Comments
 (0)