nrminor
diff --git a/‎.gitignore‎
Lines changed: 13 additions & 0 deletions b/‎.gitignore‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎assets/multiqc_config.yaml‎
Lines changed: 47 additions & 0 deletions b/‎assets/multiqc_config.yaml‎
Lines changed: 47 additions & 0 deletions
diff --git a/‎bin/extract_metrics.py‎
Lines changed: 92 additions & 0 deletions b/‎bin/extract_metrics.py‎
Lines changed: 92 additions & 0 deletions
diff --git a/‎bin/reporting/__init__.py‎
Lines changed: 28 additions & 0 deletions b/‎bin/reporting/__init__.py‎
Lines changed: 28 additions & 0 deletions
diff --git a/‎bin/reporting/extractors/__init__.py‎
Lines changed: 19 additions & 0 deletions b/‎bin/reporting/extractors/__init__.py‎
Lines changed: 19 additions & 0 deletions
diff --git a/‎bin/reporting/extractors/coverage.py‎
Lines changed: 143 additions & 0 deletions b/‎bin/reporting/extractors/coverage.py‎
Lines changed: 143 additions & 0 deletions
@@ -85,6 +85,19 @@
 !/bin/oneroof_cli/commands/
 !/bin/oneroof_cli/commands/*.py
 
+# reporting package (metrics extraction, visualization, report assembly)
+!/bin/reporting/
+!/bin/reporting/*.py
+!/bin/reporting/extractors/
+!/bin/reporting/extractors/*.py
+!/bin/reporting/visualizations/
+!/bin/reporting/visualizations/*.py
+
+# assets (templates, configs)
+!/assets/
+!/assets/*.yaml
+!/assets/*.yml
+
 # Rust development files for IDE support
 !/Cargo.toml
 !/Cargo.lock
 
@@ -0,0 +1,47 @@
+# MultiQC configuration for OneRoof reports
+# This file is used as a base template - dynamic values are substituted at runtime
+
+title: "OneRoof Amplicon Sequencing Report"
+subtitle: "Platform: {platform} | Reference: {reference}"
+
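+# Report section order (MultiQC puts higher `order` values earlier; large negative values push a section toward the end of the report - confirm this is the intended placement)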
+report_section_order:
+  oneroof_general_stats:
+    order: -1000
+  oneroof_coverage_table:
+    order: -900
+  software_versions:
+    order: -800
+
+# Search patterns for OneRoof custom content files
+sp:
+  oneroof_general_stats:
+    fn: "*_oneroof_general_stats.tsv"
+  oneroof_coverage_table:
+    fn: "*_oneroof_coverage_table.tsv"
+
+# Custom colors for QC status indicators
+custom_colours:
+  pass: "#22c55e"
+  warn: "#f59e0b"
+  fail: "#ef4444"
+
+# Table column visibility defaults
+table_columns_visible:
+  FastQC:
+    percent_duplicates: false
+    percent_gc: true
+    avg_sequence_length: true
+    percent_fails: false
+    total_sequences: true
+
+# Don't include these in the report
+exclude_modules:
+  - "snippy"
+
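+# Clean up sample names by trimming these suffixes (note: `fn_clean_exts` replaces MultiQC's default list; `extra_fn_clean_exts` appends to it instead)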
+fn_clean_exts:
+  - ".per-base"
+  - ".consensus"
+  - "_coverage_metrics"
+  - "_variant_effects"
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+"""
+Extract metrics from OneRoof pipeline outputs.
+
+This CLI provides subcommands for extracting metrics from various pipeline
+stages into validated JSON files. These per-sample JSON files are later
+assembled into the final OneRoof report.
+
+Usage:
+    extract_metrics.py coverage --sample-id SAMPLE --bed coverage.bed --output metrics.json
+    extract_metrics.py --help
+
+Future subcommands (Phase 2):
+    extract_metrics.py alignment --sample-id SAMPLE --bam aligned.bam --output metrics.json
+    extract_metrics.py variants --sample-id SAMPLE --effects-tsv variants.tsv --output metrics.json
+    extract_metrics.py consensus --sample-id SAMPLE --fasta consensus.fa --output metrics.json
+"""
+
+from pathlib import Path
+from typing import Annotated
+
+import typer
+from rich.console import Console
+
+from reporting.extractors.coverage import extract as extract_coverage
+
+app = typer.Typer(
+    name="extract_metrics",
+    help="Extract metrics from OneRoof pipeline outputs into validated JSON.",
+    add_completion=False,  # do not add the shell-completion install options
+    rich_markup_mode="rich",  # render help text with Rich markup
+    no_args_is_help=True,  # invoking the CLI with no arguments prints help
+    context_settings={"help_option_names": ["-h", "--help"]},  # accept -h as well as --help
+)
+console = Console()  # shared Rich console used by commands for status messages
+
+
+@app.callback()
+def main() -> None:
+    """
+    Extract metrics from OneRoof pipeline outputs into validated JSON.
+
+    Each subcommand extracts metrics from a specific pipeline stage.
+    """
+    pass
+
+
+@app.command("coverage")
+def coverage(
+    sample_id: Annotated[
+        str,
+        typer.Option("--sample-id", "-s", help="Sample identifier"),
+    ],
+    bed: Annotated[
+        Path,
+        typer.Option(
+            "--bed",
+            "-b",
+            help="Path to bedtools genomecov BED file",
+            exists=True,
+            file_okay=True,
+            dir_okay=False,
+            readable=True,
+            resolve_path=True,
+        ),
+    ],
+    output: Annotated[
+        Path,
+        typer.Option(
+            "--output",
+            "-o",
+            help="Output JSON file path",
+            file_okay=True,
+            dir_okay=False,
+            resolve_path=True,
+        ),
+    ],
+) -> None:
+    """
+    Extract coverage metrics from bedtools genomecov output.
+
+    Parses a per-base BED file (from `bedtools genomecov -bga`) and computes
+    coverage statistics including mean/median depth and genome coverage at
+    various thresholds.
+    """
+    metrics = extract_coverage(sample_id, bed)
+    output.write_text(metrics.model_dump_json(indent=2))
+    console.print(f"[green]Wrote coverage metrics to {output}[/green]")
+
+
+if __name__ == "__main__":
+    app()  # run the Typer CLI only when executed as a script, not on import
@@ -0,0 +1,28 @@
+"""
+OneRoof Reporting Package.
+
+This package provides metrics extraction, report assembly, and visualization
+generation for the OneRoof amplicon sequencing pipeline.
+
+Subpackages:
+    extractors: Per-sample metrics extraction from pipeline outputs
+    visualizations: Altair-based chart generation (Phase 1+)
+
+Modules:
+    schema: Pydantic models defining the canonical report JSON structure
+    multiqc: MultiQC custom content file generation (Phase 1+)
+"""
+
+from .multiqc import (
+    DEFAULT_MULTIQC_TEMPLATE,
+    generate_coverage_table_tsv,
+    generate_general_stats_tsv,
+    generate_multiqc_config,
+)
+
+__all__ = [  # names imported from .multiqc above, exposed as the package's public API
+    "DEFAULT_MULTIQC_TEMPLATE",
+    "generate_coverage_table_tsv",
+    "generate_general_stats_tsv",
+    "generate_multiqc_config",
+]
@@ -0,0 +1,19 @@
+"""
+Metrics Extractors for OneRoof Reporting.
+
+Each extractor module parses output from a specific pipeline stage and produces
+a validated Pydantic model containing the extracted metrics. These per-sample
+JSON files are later assembled into the final OneRoof report.
+
+Extractors:
+    coverage: Parse bedtools genomecov output for coverage statistics
+    alignment: Parse BAM files for read mapping statistics (Phase 2)
+    variants: Parse SnpSift variant effects TSV (Phase 2)
+    consensus: Parse consensus FASTA for sequence statistics (Phase 2)
+    metagenomics: Parse Sylph profile output (Phase 2)
+    haplotyping: Parse Devider output for haplotype statistics (Phase 2, ONT only)
+"""
+
+from .coverage import extract as extract_coverage
+
+__all__ = ["extract_coverage"]  # public alias for coverage.extract (imported above)
@@ -0,0 +1,143 @@
+"""
+Coverage metrics extractor for OneRoof reporting.
+
+Parses bedtools genomecov output (BED format with -bga flag) to compute
+coverage statistics for a sample. The output is a per-base BED file with
+columns: chrom, start, end, depth.
+
+Example input (from `bedtools genomecov -bga`):
+    MN908947.3    0       54      0
+    MN908947.3    54      100     15
+    MN908947.3    100     200     150
+    ...
+"""
+
+from pathlib import Path
+
+import polars as pl
+from pydantic import BaseModel, Field
+
+
+class CoverageMetrics(BaseModel):
+    """Intermediate coverage metrics extracted from genomecov BED output."""
+
+    sample_id: str
+    total_bases: int = Field(ge=0, description="Total bases in reference")
+    mean_coverage: float = Field(ge=0, description="Weighted mean depth")
+    median_coverage: float = Field(ge=0, description="Weighted median depth")
+    genome_coverage_at_1x: float = Field(
+        ge=0, le=1, description="Fraction of genome with ≥1x coverage"
+    )
+    genome_coverage_at_10x: float = Field(
+        ge=0, le=1, description="Fraction of genome with ≥10x coverage"
+    )
+    genome_coverage_at_100x: float = Field(
+        ge=0, le=1, description="Fraction of genome with ≥100x coverage"
+    )
+    min_coverage: int = Field(ge=0, description="Minimum depth observed")
+    max_coverage: int = Field(ge=0, description="Maximum depth observed")
+
+
+def load_coverage_bed(bed_path: Path) -> pl.DataFrame:
+    """
+    Load a bedtools genomecov BED file.
+
+    Args:
+        bed_path: Path to the per-base BED file from `bedtools genomecov -bga`
+
+    Returns:
+        DataFrame with columns: chrom, start, end, depth
+    """
+    return pl.read_csv(
+        bed_path,
+        separator="\t",
+        has_header=False,
+        new_columns=["chrom", "start", "end", "depth"],
+        schema={
+            "chrom": pl.Utf8,
+            "start": pl.Int64,
+            "end": pl.Int64,
+            "depth": pl.Int64,
+        },
+    )
+
+
+def compute_coverage_stats(df: pl.DataFrame) -> dict:
+    """
+    Compute coverage statistics from a genomecov DataFrame.
+
+    Args:
+        df: DataFrame with columns: chrom, start, end, depth
+
+    Returns:
+        Dictionary with computed coverage statistics
+    """
+    # Calculate region lengths
+    df = df.with_columns((pl.col("end") - pl.col("start")).alias("length"))
+
+    total_bases = df["length"].sum()
+
+    if total_bases == 0:
+        return {
+            "total_bases": 0,
+            "mean_coverage": 0.0,
+            "median_coverage": 0.0,
+            "genome_coverage_at_1x": 0.0,
+            "genome_coverage_at_10x": 0.0,
+            "genome_coverage_at_100x": 0.0,
+            "min_coverage": 0,
+            "max_coverage": 0,
+        }
+
+    # Weighted mean: sum(depth * length) / total_bases
+    weighted_sum = (df["depth"] * df["length"]).sum()
+    mean_coverage = weighted_sum / total_bases
+
+    # Weighted median: expand depths by length and find median
+    # For efficiency with large genomes, we compute this from the cumulative distribution
+    sorted_df = df.sort("depth")
+    sorted_df = sorted_df.with_columns(
+        (pl.col("length").cum_sum() / total_bases).alias("cumulative_frac")
+    )
+    # Find the first row where cumulative fraction >= 0.5
+    median_row = sorted_df.filter(pl.col("cumulative_frac") >= 0.5).head(1)
+    median_coverage = float(median_row["depth"][0]) if len(median_row) > 0 else 0.0
+
+    # Coverage at thresholds
+    bases_at_1x = df.filter(pl.col("depth") >= 1)["length"].sum()
+    bases_at_10x = df.filter(pl.col("depth") >= 10)["length"].sum()
+    bases_at_100x = df.filter(pl.col("depth") >= 100)["length"].sum()
+
+    # Min and max coverage
+    # Note: Polars min/max return PythonLiteral which includes int at runtime
+    min_cov_value = df["depth"].min()
+    max_cov_value = df["depth"].max()
+    min_coverage = 0 if min_cov_value is None else int(min_cov_value)  # type: ignore[arg-type]
+    max_coverage = 0 if max_cov_value is None else int(max_cov_value)  # type: ignore[arg-type]
+
+    return {
+        "total_bases": int(total_bases),
+        "mean_coverage": float(mean_coverage),
+        "median_coverage": float(median_coverage),
+        "genome_coverage_at_1x": float(bases_at_1x / total_bases),
+        "genome_coverage_at_10x": float(bases_at_10x / total_bases),
+        "genome_coverage_at_100x": float(bases_at_100x / total_bases),
+        "min_coverage": min_coverage,
+        "max_coverage": max_coverage,
+    }
+
+
+def extract(sample_id: str, bed_path: Path) -> CoverageMetrics:
+    """
+    Extract coverage metrics from a bedtools genomecov BED file.
+
+    Args:
+        sample_id: Sample identifier
+        bed_path: Path to the per-base BED file
+
+    Returns:
+        Validated CoverageMetrics model
+    """
+    df = load_coverage_bed(bed_path)
+    stats = compute_coverage_stats(df)
+    return CoverageMetrics(sample_id=sample_id, **stats)