Skip to content

Commit 211a475

Browse files
committed
adding the foundation for a new reporting system that will integrate with multiqc and make data available for other frontends
1 parent c857224 commit 211a475

File tree

17 files changed

+1557
-25
lines changed

17 files changed

+1557
-25
lines changed

.gitignore

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,19 @@
8585
!/bin/oneroof_cli/commands/
8686
!/bin/oneroof_cli/commands/*.py
8787

88+
# reporting package (metrics extraction, visualization, report assembly)
89+
!/bin/reporting/
90+
!/bin/reporting/*.py
91+
!/bin/reporting/extractors/
92+
!/bin/reporting/extractors/*.py
93+
!/bin/reporting/visualizations/
94+
!/bin/reporting/visualizations/*.py
95+
96+
# assets (templates, configs)
97+
!/assets/
98+
!/assets/*.yaml
99+
!/assets/*.yml
100+
88101
# Rust development files for IDE support
89102
!/Cargo.toml
90103
!/Cargo.lock

assets/multiqc_config.yaml

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
# MultiQC configuration for OneRoof reports
2+
# This file is used as a base template - dynamic values are substituted at runtime
3+
4+
title: "OneRoof Amplicon Sequencing Report"
5+
subtitle: "Platform: {platform} | Reference: {reference}"
6+
7+
# Report section order (negative = earlier)
8+
report_section_order:
9+
oneroof_general_stats:
10+
order: -1000
11+
oneroof_coverage_table:
12+
order: -900
13+
software_versions:
14+
order: -800
15+
16+
# Search patterns for OneRoof custom content files
17+
sp:
18+
oneroof_general_stats:
19+
fn: "*_oneroof_general_stats.tsv"
20+
oneroof_coverage_table:
21+
fn: "*_oneroof_coverage_table.tsv"
22+
23+
# Custom colors for QC status indicators
24+
custom_colours:
25+
pass: "#22c55e"
26+
warn: "#f59e0b"
27+
fail: "#ef4444"
28+
29+
# Table column visibility defaults
30+
table_columns_visible:
31+
FastQC:
32+
percent_duplicates: false
33+
percent_gc: true
34+
avg_sequence_length: true
35+
percent_fails: false
36+
total_sequences: true
37+
38+
# Don't include these in the report
39+
exclude_modules:
40+
- "snippy"
41+
42+
# Clean up sample names
43+
fn_clean_exts:
44+
- ".per-base"
45+
- ".consensus"
46+
- "_coverage_metrics"
47+
- "_variant_effects"

bin/extract_metrics.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
#!/usr/bin/env python3
2+
"""
3+
Extract metrics from OneRoof pipeline outputs.
4+
5+
This CLI provides subcommands for extracting metrics from various pipeline
6+
stages into validated JSON files. These per-sample JSON files are later
7+
assembled into the final OneRoof report.
8+
9+
Usage:
10+
extract_metrics.py coverage --sample-id SAMPLE --bed coverage.bed --output metrics.json
11+
extract_metrics.py --help
12+
13+
Future subcommands (Phase 2):
14+
extract_metrics.py alignment --sample-id SAMPLE --bam aligned.bam --output metrics.json
15+
extract_metrics.py variants --sample-id SAMPLE --effects-tsv variants.tsv --output metrics.json
16+
extract_metrics.py consensus --sample-id SAMPLE --fasta consensus.fa --output metrics.json
17+
"""
18+
19+
from pathlib import Path
20+
from typing import Annotated
21+
22+
import typer
23+
from rich.console import Console
24+
25+
from reporting.extractors.coverage import extract as extract_coverage
26+
27+
# Root Typer application; each pipeline stage is registered below as a subcommand.
app = typer.Typer(
    name="extract_metrics",
    help="Extract metrics from OneRoof pipeline outputs into validated JSON.",
    add_completion=False,
    rich_markup_mode="rich",
    # Invoking the tool with no arguments shows the help text.
    no_args_is_help=True,
    # Accept the short -h flag in addition to --help.
    context_settings={"help_option_names": ["-h", "--help"]},
)

# Rich console used by the subcommands to print status messages.
console = Console()
36+
37+
38+
@app.callback()
def main() -> None:
    """
    Extract metrics from OneRoof pipeline outputs into validated JSON.

    Each subcommand extracts metrics from a specific pipeline stage.
    """
    # Deliberately empty: this function exists only to be registered as the
    # application-level callback. NOTE(review): presumably this keeps the CLI
    # in explicit-subcommand mode while only `coverage` is defined - confirm.
46+
47+
48+
@app.command("coverage")
def coverage(
    sample_id: Annotated[
        str,
        typer.Option("--sample-id", "-s", help="Sample identifier"),
    ],
    bed: Annotated[
        Path,
        typer.Option(
            "--bed",
            "-b",
            help="Path to bedtools genomecov BED file",
            exists=True,
            file_okay=True,
            dir_okay=False,
            readable=True,
            resolve_path=True,
        ),
    ],
    output: Annotated[
        Path,
        typer.Option(
            "--output",
            "-o",
            help="Output JSON file path",
            file_okay=True,
            dir_okay=False,
            resolve_path=True,
        ),
    ],
) -> None:
    """
    Extract coverage metrics from bedtools genomecov output.

    Parses a per-base BED file (from `bedtools genomecov -bga`) and computes
    coverage statistics including mean/median depth and genome coverage at
    various thresholds.
    """
    # The docstring above doubles as the subcommand's --help text, so it is
    # kept verbatim; implementation notes live in comments instead.
    #
    # Parameters (all supplied as command-line options):
    #   sample_id: identifier stored in the resulting metrics record.
    #   bed:       existing, readable genomecov file (checked by the option
    #              settings before this body runs).
    #   output:    destination path for the JSON document.
    metrics = extract_coverage(sample_id, bed)

    # Write with an explicit UTF-8 encoding rather than the locale default, so
    # the JSON stays valid when the sample ID contains non-ASCII characters
    # and the process runs under a non-UTF-8 locale.
    output.write_text(metrics.model_dump_json(indent=2), encoding="utf-8")
    console.print(f"[green]Wrote coverage metrics to {output}[/green]")
89+
90+
91+
# Run the Typer application only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    app()

bin/reporting/__init__.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
"""
2+
OneRoof Reporting Package.
3+
4+
This package provides metrics extraction, report assembly, and visualization
5+
generation for the OneRoof amplicon sequencing pipeline.
6+
7+
Subpackages:
8+
extractors: Per-sample metrics extraction from pipeline outputs
9+
visualizations: Altair-based chart generation (Phase 1+)
10+
11+
Modules:
12+
schema: Pydantic models defining the canonical report JSON structure
13+
multiqc: MultiQC custom content file generation (Phase 1+)
14+
"""
15+
16+
from .multiqc import (
17+
DEFAULT_MULTIQC_TEMPLATE,
18+
generate_coverage_table_tsv,
19+
generate_general_stats_tsv,
20+
generate_multiqc_config,
21+
)
22+
23+
__all__ = [
24+
"DEFAULT_MULTIQC_TEMPLATE",
25+
"generate_coverage_table_tsv",
26+
"generate_general_stats_tsv",
27+
"generate_multiqc_config",
28+
]
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
"""
2+
Metrics Extractors for OneRoof Reporting.
3+
4+
Each extractor module parses output from a specific pipeline stage and produces
5+
a validated Pydantic model containing the extracted metrics. These per-sample
6+
JSON files are later assembled into the final OneRoof report.
7+
8+
Extractors:
9+
coverage: Parse bedtools genomecov output for coverage statistics
10+
alignment: Parse BAM files for read mapping statistics (Phase 2)
11+
variants: Parse SnpSift variant effects TSV (Phase 2)
12+
consensus: Parse consensus FASTA for sequence statistics (Phase 2)
13+
metagenomics: Parse Sylph profile output (Phase 2)
14+
haplotyping: Parse Devider output for haplotype statistics (Phase 2, ONT only)
15+
"""
16+
17+
from .coverage import extract as extract_coverage
18+
19+
__all__ = ["extract_coverage"]
Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,143 @@
1+
"""
2+
Coverage metrics extractor for OneRoof reporting.
3+
4+
Parses bedtools genomecov output (BED format with -bga flag) to compute
5+
coverage statistics for a sample. The input is a BedGraph-style file in which
6+
each row is an interval of constant depth, with columns: chrom, start, end, depth.
7+
8+
Example input (from `bedtools genomecov -bga`):
9+
MN908947.3 0 54 0
10+
MN908947.3 54 100 15
11+
MN908947.3 100 200 150
12+
...
13+
"""
14+
15+
from pathlib import Path
16+
17+
import polars as pl
18+
from pydantic import BaseModel, Field
19+
20+
21+
class CoverageMetrics(BaseModel):
    """Intermediate coverage metrics extracted from genomecov BED output."""

    # Identifier of the sample these metrics describe.
    sample_id: str

    # Size of the covered reference and central-tendency depth values.
    total_bases: int = Field(
        description="Total bases in reference",
        ge=0,
    )
    mean_coverage: float = Field(
        description="Weighted mean depth",
        ge=0,
    )
    median_coverage: float = Field(
        description="Weighted median depth",
        ge=0,
    )

    # Breadth of coverage: each value is a fraction constrained to [0, 1].
    genome_coverage_at_1x: float = Field(
        description="Fraction of genome with ≥1x coverage",
        ge=0,
        le=1,
    )
    genome_coverage_at_10x: float = Field(
        description="Fraction of genome with ≥10x coverage",
        ge=0,
        le=1,
    )
    genome_coverage_at_100x: float = Field(
        description="Fraction of genome with ≥100x coverage",
        ge=0,
        le=1,
    )

    # Extremes of the observed depth values.
    min_coverage: int = Field(
        description="Minimum depth observed",
        ge=0,
    )
    max_coverage: int = Field(
        description="Maximum depth observed",
        ge=0,
    )
39+
40+
41+
def load_coverage_bed(bed_path: Path) -> pl.DataFrame:
    """
    Load a bedtools genomecov BED file.

    The file is read as headerless, tab-separated text with exactly four
    columns. An empty file yields an empty DataFrame with the same columns
    and dtypes instead of raising, so that `compute_coverage_stats` can apply
    its zero-coverage handling.

    Args:
        bed_path: Path to the interval file from `bedtools genomecov -bga`

    Returns:
        DataFrame with columns: chrom, start, end, depth
    """
    # Single source of truth for both the column names and their dtypes.
    schema = {
        "chrom": pl.Utf8,
        "start": pl.Int64,
        "end": pl.Int64,
        "depth": pl.Int64,
    }

    try:
        return pl.read_csv(
            bed_path,
            separator="\t",
            has_header=False,
            new_columns=list(schema),
            schema=schema,
        )
    except pl.exceptions.NoDataError:
        # polars refuses to parse a file with no data; represent it as zero
        # rows so downstream code sees the expected columns.
        return pl.DataFrame(schema=schema)
63+
64+
65+
def compute_coverage_stats(df: pl.DataFrame) -> dict:
    """
    Summarise depth of coverage from genomecov intervals.

    Every statistic is weighted by interval length (end - start), so an
    interval spanning many bases counts proportionally more than a short one.
    The median is the depth of the first interval, in ascending depth order,
    at which the cumulative share of bases reaches 0.5.

    Args:
        df: DataFrame with columns: chrom, start, end, depth

    Returns:
        Dictionary with keys total_bases, mean_coverage, median_coverage,
        genome_coverage_at_1x / _10x / _100x, min_coverage and max_coverage.
        All values are zero when the intervals cover no bases.
    """
    spans = df.with_columns((pl.col("end") - pl.col("start")).alias("length"))
    total_bases = spans["length"].sum()

    # Guard clause: with no bases there is nothing to divide by.
    if total_bases == 0:
        return {
            "total_bases": 0,
            "mean_coverage": 0.0,
            "median_coverage": 0.0,
            "genome_coverage_at_1x": 0.0,
            "genome_coverage_at_10x": 0.0,
            "genome_coverage_at_100x": 0.0,
            "min_coverage": 0,
            "max_coverage": 0,
        }

    def fraction_at_or_above(threshold: int) -> float:
        # Share of all bases lying in intervals whose depth is >= threshold.
        covered = spans.filter(pl.col("depth") >= threshold)["length"].sum()
        return float(covered / total_bases)

    # Length-weighted mean depth.
    mean_coverage = (spans["depth"] * spans["length"]).sum() / total_bases

    # Length-weighted median via the cumulative distribution over depth,
    # which avoids expanding the intervals to one value per base.
    by_depth = spans.sort("depth").with_columns(
        (pl.col("length").cum_sum() / total_bases).alias("cumulative_frac")
    )
    halfway = by_depth.filter(pl.col("cumulative_frac") >= 0.5).head(1)
    if len(halfway) > 0:
        median_coverage = float(halfway["depth"][0])
    else:
        median_coverage = 0.0

    # min()/max() may return None (e.g. when every depth is null); report 0 then.
    lowest = spans["depth"].min()
    highest = spans["depth"].max()
    if lowest is None:
        min_coverage = 0
    else:
        min_coverage = int(lowest)  # type: ignore[arg-type]
    if highest is None:
        max_coverage = 0
    else:
        max_coverage = int(highest)  # type: ignore[arg-type]

    return {
        "total_bases": int(total_bases),
        "mean_coverage": float(mean_coverage),
        "median_coverage": float(median_coverage),
        "genome_coverage_at_1x": fraction_at_or_above(1),
        "genome_coverage_at_10x": fraction_at_or_above(10),
        "genome_coverage_at_100x": fraction_at_or_above(100),
        "min_coverage": min_coverage,
        "max_coverage": max_coverage,
    }
128+
129+
130+
def extract(sample_id: str, bed_path: Path) -> CoverageMetrics:
    """
    Build the coverage metrics record for one sample.

    Loads the genomecov file, computes the coverage statistics, and wraps
    them together with the sample ID in a validated model.

    Args:
        sample_id: Sample identifier
        bed_path: Path to the bedtools genomecov BED file

    Returns:
        Validated CoverageMetrics model
    """
    # load -> summarise -> validate, composed in one expression.
    return CoverageMetrics(
        sample_id=sample_id,
        **compute_coverage_stats(load_coverage_bed(bed_path)),
    )

0 commit comments

Comments
 (0)