more tests

Mike Lee · Mike Lee · commit f96bd696e46c · 2025-08-29T16:06:27.000-04:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -16,7 +16,7 @@
 
 ### Added
 - more test coverage of `bit-ez-screen`
-- unit tests for `bit-gen-kraken2-tax-plots`
+- unit tests for `bit-gen-kraken2-tax-plots` and `bit-kraken2-to-taxon-summaries`
 - integration test for `bit-cov-analyzer`
 
 ### Changed
diff --git a/bit/tests/test_gen_kraken2_tax_plots.py b/bit/tests/test_gen_kraken2_tax_plots.py
@@ -1,4 +1,3 @@
-import io
 from pathlib import Path
 import pandas as pd
 import pytest
diff --git a/bit/tests/test_input_parsing.py b/bit/tests/test_input_parsing.py
@@ -0,0 +1,133 @@
+import pytest
+from bit.modules import input_parsing as m
+
+
+class PrematureExit(Exception):
+    """Raised by our monkeypatched notify_premature_exit to test failure paths."""
+
+@pytest.fixture(autouse=True)
+def patch_failure_hooks(monkeypatch):
+    # capture messages sent to report_message; raise on premature exit
+    messages = []
+
+    def fake_report_message(msg, initial_indent="", subsequent_indent=""):
+        messages.append(("report", msg, initial_indent, subsequent_indent))
+
+    def fake_notify_premature_exit():
+        raise PrematureExit("premature-exit")
+
+    monkeypatch.setattr(m, "report_message", fake_report_message)
+    monkeypatch.setattr(m, "notify_premature_exit", fake_notify_premature_exit)
+
+    return messages
+
+
+@pytest.mark.parametrize("fname,expected", [
+    ("sample_R1_.fastq.gz", ("sample", "R1")),
+    ("sample_R2_.fastq.gz", ("sample", "R2")),
+    ("sample-R1-.fq.gz",    ("sample", "R1")),
+    ("sample-R2-.fq.gz",    ("sample", "R2")),
+    ("sample.R1..fastq.gz", ("sample", "R1")),
+    ("sample.R2..fastq.gz", ("sample", "R2")),
+    ("sample_1..fastq.gz",  ("sample", "R1")),
+    ("sample_2..fastq.gz",  ("sample", "R2")),
+    ("no_tag.fastq.gz",     (None, None)),
+])
+def test_parse_read_filename_variants(fname, expected):
+    assert m.parse_read_filename(fname) == expected
+
+
+def test_validate_extension_accepts_known_extensions(tmp_path):
+    for ext in m.accepted_read_extensions:
+        p = tmp_path / f"x{ext}"
+        p.write_text("data")
+        m.validate_extension(p)
+
+def test_validate_extension_rejects_unknown_extension(tmp_path):
+    p = tmp_path / "x.fq"  # not gzipped → reject
+    p.write_text("data")
+    with pytest.raises(PrematureExit):
+        m.validate_extension(p)
+
+
+def test_get_input_reads_dict_from_paths_happy_path(tmp_path):
+    r1 = tmp_path / "samp_R1_.fastq.gz"
+    r2 = tmp_path / "samp_R2_.fastq.gz"
+    r1.write_text("r1")
+    r2.write_text("r2")
+
+    out = m.get_input_reads_dict_from_paths(r1, r2)
+    assert list(out.keys()) == ["samp"]
+    assert set(out["samp"].keys()) == {"R1", "R2"}
+    assert out["samp"]["R1"] == str(r1.resolve())
+    assert out["samp"]["R2"] == str(r2.resolve())
+
+def test_get_input_reads_dict_from_paths_missing_designation_calls_exit(tmp_path):
+    bad = tmp_path / "samp.fastq.gz"  # no R1/R2 tag
+    bad.write_text("x")
+    with pytest.raises(PrematureExit):
+        m.get_input_reads_dict_from_paths(bad, None)
+
+
+def test_get_input_reads_dict_from_paths_wrong_slot_calls_exit(tmp_path):
+    # file says R2 but provided as R1 argument
+    wrong = tmp_path / "samp_R2_.fastq.gz"
+    wrong.write_text("x")
+    with pytest.raises(PrematureExit):
+        m.get_input_reads_dict_from_paths(wrong, None)
+
+
+def test_get_input_reads_dict_from_dir_pairs_samples_and_ignores_noise(tmp_path):
+    # A complete pair
+    (tmp_path / "A_R1_.fastq.gz").write_text("a1")
+    (tmp_path / "A_R2_.fastq.gz").write_text("a2")
+    # Extra unrelated files in the same directory that should be ignored
+    (tmp_path / "notes.txt").write_text("ignore me")
+    (tmp_path / "weird.fq").write_text("ignore me")  # bad ext
+    (tmp_path / "junk.fastq.gz").write_text("no R tag")  # will be parsed as None,None and skipped
+
+    out = m.get_input_reads_dict_from_dir(tmp_path)
+    assert list(out.keys()) == ["A"]
+    assert set(out["A"].keys()) == {"R1", "R2"}
+
+def test_get_input_reads_dict_from_dir_detects_incomplete_pairs_and_exits(tmp_path, patch_failure_hooks):
+    # Only R1 present for B → should trigger error
+    (tmp_path / "B_R1_.fastq.gz").write_text("b1")
+    with pytest.raises(PrematureExit):
+        m.get_input_reads_dict_from_dir(tmp_path)
+
+    # Ensure a diagnostic report_message was sent
+    reports = [x for x in patch_failure_hooks if x[0] == "report"]
+    assert reports, "expected report_message() to be called"
+    # The message mentions the input directory
+    assert str(tmp_path) in reports[-1][1]
+
+def test_get_input_reads_dict_from_dir_handles_multiple_samples(tmp_path):
+    # C paired
+    (tmp_path / "C_R1_.fastq.gz").write_text("c1")
+    (tmp_path / "C_R2_.fastq.gz").write_text("c2")
+    # D paired, using a different accepted delimiter style ("-R1-") and extension (".fq.gz")
+    (tmp_path / "D-R1-.fq.gz").write_text("d1")
+    (tmp_path / "D-R2-.fq.gz").write_text("d2")
+
+    out = m.get_input_reads_dict_from_dir(tmp_path)
+    assert set(out.keys()) == {"C", "D"}
+    assert set(out["C"].keys()) == {"R1", "R2"}
+    assert set(out["D"].keys()) == {"R1", "R2"}
+
+
+def test_get_input_reads_dict_from_dir_skips_files_with_no_designation(tmp_path):
+    (tmp_path / "E.fastq.gz").write_text("no tag")
+    out = m.get_input_reads_dict_from_dir(tmp_path)
+    assert out == {}
+
+
+def test_parse_read_filename_uses_basename_not_path(tmp_path):
+    # Ensure directories in the path don’t confuse parsing
+    p = tmp_path / "subdir"
+    p.mkdir()
+    f = p / "sample.R1..fastq.gz"
+    f.write_text("x")
+    # give full path; function uses Path(...).name internally
+    samp, which = m.parse_read_filename(str(f))
+    assert samp == "sample" and which == "R1"
diff --git a/bit/tests/test_kraken2_to_taxon_summaries.py b/bit/tests/test_kraken2_to_taxon_summaries.py
@@ -0,0 +1,169 @@
+import pandas as pd
+import numpy as np
+import pytest
+from bit.modules import kraken2_to_taxon_summaries as k
+
+
+@pytest.fixture
+def report_min(tmp_path):
+    """
+    Tiny kraken2-like report covering:
+      - unclassified (U)
+      - root (R)
+      - domain given as R1 with Bacteria (to test normalize_rank_code)
+      - genus lines with spaces around names (to test stripping)
+      - one non-standard rank '-' which should produce a row but not change lineage
+    """
+    text = "\n".join([
+        # percent clade_reads taxon_reads rank taxid name(with indent)
+        "10.00 100 100 U 0 unclassified",
+        "90.00 900 50 R 1 root",
+        "80.00 800 0 R1 2 Bacteria",
+        "25.00 250 250 G 123    GenX",     # leading spaces
+        "15.00 150 150 G 124 GenY   ",     # trailing spaces
+        " 0.50   5   5 -  9999    12345_like_strain",  # non-standard rank
+        "",  # blank line should be ignored
+    ]) + "\n"
+    p = tmp_path / "k.report"
+    p.write_text(text)
+    return p
+
+
+def test_parse_report_line_basic_space_split():
+    line = "12.3 120 12 G 1234    Some Genus"
+    rec = k.parse_report_line(line)
+    assert rec == {
+        "clade_reads": 120,
+        "taxon_reads": 12,
+        "rank": "G",
+        "taxid": 1234,
+        "name": "Some Genus",
+    }
+
+def test_parse_report_line_tab_fallback():
+    line = "12.3\t120\t12\tG\t1234\t   Some\tGenus  "
+    rec = k.parse_report_line(line)
+    assert rec["taxid"] == 1234
+    assert rec["rank"] == "G"
+    assert rec["name"] == "Some\tGenus"
+
+def test_normalize_rank_code_domain_R1():
+    assert k.normalize_rank_code("R1", "Bacteria") == "D"
+    assert k.normalize_rank_code("R1", "Viruses") == "D"
+    assert k.normalize_rank_code("R1", "WeirdDomain") == "R1"
+    assert k.normalize_rank_code("G", "Genus") == "G"
+
+
+def test_parse_report_builds_lineages(report_min):
+    df = k.parse_report(str(report_min))
+
+    un = df[df["taxid"] == 0].iloc[0]
+    assert all(un[r] == "Unclassified" for r in k.STD_RANKS)
+    assert un["read_counts"] == 100
+
+    gx = df[df["taxid"] == 123].iloc[0]
+    gy = df[df["taxid"] == 124].iloc[0]
+    assert gx["genus"] == "GenX"
+    assert gy["genus"] == "GenY"
+    assert gx["domain"] == "Bacteria"
+    assert gy["domain"] == "Bacteria"
+
+    row_dash = df[df["taxid"] == 9999].iloc[0]
+    assert row_dash["domain"] == "Bacteria"
+    assert row_dash["domain"] != "Unclassified"
+
+
+def test_refine_df_aggregates_duplicates_and_computes_percents():
+    rows = [
+        {"taxid": 5, "domain":"Bacteria","phylum":"Firmicutes","class":"Bacilli","order":"NA","family":"NA","genus":"GenZ","species":"NA","read_counts":30},
+        {"taxid": 5, "domain":"Bacteria","phylum":"Firmicutes","class":"Bacilli","order":"NA","family":"NA","genus":"GenZ","species":"NA","read_counts":70},
+        {"taxid": 0, **{r:"Unclassified" for r in k.STD_RANKS}, "read_counts":100},
+    ]
+    df = pd.DataFrame(rows)
+    out = k.refine_df(df.copy())
+
+    agg = out[out["taxid"] == 5].iloc[0]
+    assert agg["read_counts"] == 100
+
+    z = out.set_index("taxid")["percent_of_reads"].to_dict()
+    assert np.isclose(z[5], 50.0, atol=1e-6)
+    assert np.isclose(z[0], 50.0, atol=1e-6)
+
+def test_refine_df_zero_total_reads():
+    rows = [
+        {"taxid": 1, **{r:"NA" for r in k.STD_RANKS}, "read_counts":0},
+        {"taxid": 2, **{r:"NA" for r in k.STD_RANKS}, "read_counts":0},
+    ]
+    df = pd.DataFrame(rows)
+    out = k.refine_df(df.copy())
+    # zero-read rows are dropped
+    assert out.empty
+
+
+def test_sort_df_custom_groups_and_stability():
+    # building rows to exercise sort groups:
+    #  - Unclassified (taxid=0) should come first
+    #  - All-NA lineage (group 1) second
+    #  - The rest (group 2) sorted by lineage then taxid (mergesort: stable)
+    rows = [
+        {"taxid": 3, "domain":"Bacteria","phylum":"Actino","class":"C1","order":"O1","family":"F1","genus":"G1","species":"S1","read_counts":10},
+        {"taxid": 0, **{r:"Unclassified" for r in k.STD_RANKS}, "read_counts":10},
+        {"taxid": 2, **{r:"NA" for r in k.STD_RANKS}, "read_counts":10},
+        {"taxid": 4, "domain":"Archaea","phylum":"Eury","class":"C1","order":"O1","family":"F1","genus":"G1","species":"S1","read_counts":10},
+        {"taxid": 5, "domain":"Archaea","phylum":"Eury","class":"C1","order":"O1","family":"F1","genus":"G1","species":"S1","read_counts":10},
+    ]
+    df = pd.DataFrame(rows)
+    out = k.sort_df(df.copy())
+
+    # expected order by sort_group then lineage then taxid:
+    # 0 first, then 2, then archaea 4,5 (taxid ascending), then bacteria 3
+    assert list(out["taxid"]) == [0, 2, 4, 5, 3]
+
+
+def test_kraken2_to_taxon_summaries_writes_tsv(tmp_path, report_min, monkeypatch):
+    # avoid touching the filesystem checker in unit tests
+    monkeypatch.setattr(k, "check_files_are_found", lambda paths: None)
+
+    out = tmp_path / "summary.tsv"
+    k.kraken2_to_taxon_summaries(str(report_min), str(out))
+
+    assert out.exists()
+
+    df = pd.read_csv(out, sep="\t")
+    expected_cols = ["taxid"] + k.STD_RANKS + ["read_counts", "percent_of_reads"]
+    assert list(df.columns) == expected_cols
+    assert (df["taxid"] == 0).any()
+    assert df["percent_of_reads"].map(lambda x: isinstance(x, float)).all()
+
+
+def test_parse_report_ignores_blank_and_handles_U_and_R1(tmp_path):
+    text = "\n".join([
+        "  ",  # blank
+        "5.0 50 50 U 0 unclassified",
+        "95.0 950 0 R1 2 Bacteria",
+        "20.0 200 200 G 9  GenA",
+    ])
+    p = tmp_path / "mini.report"
+    p.write_text(text)
+
+    df = k.parse_report(str(p))
+    assert (df["taxid"] == 0).any()
+    gen = df[df["taxid"] == 9].iloc[0]
+    assert gen["domain"] == "Bacteria"
+
+
+def test_preflight_checks_calls_validator(monkeypatch):
+    called = {}
+    monkeypatch.setattr(k, "check_files_are_found", lambda paths: called.setdefault("ok", paths))
+    k.preflight_checks("abc.txt")
+    assert "ok" in called and called["ok"] == ["abc.txt"]
+
+
+def test_parse_report_line_bad_line_uses_report_failure(monkeypatch):
+    msgs = {}
+    def fake_report_failure(msg):
+        raise ValueError(msg)
+    monkeypatch.setattr(k, "report_failure", fake_report_failure)
+
+    with pytest.raises(ValueError):
+        k.parse_report_line("not enough fields")

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,3 @@`
`1`		`-import io`
`2`	`1`	`from pathlib import Path`
`3`	`2`	`import pandas as pd`
`4`	`3`	`import pytest`