|
| 1 | +import pandas as pd |
| 2 | +import numpy as np |
| 3 | +import pytest |
| 4 | +from bit.modules import kraken2_to_taxon_summaries as k |
| 5 | + |
| 6 | + |
@pytest.fixture
def report_min(tmp_path):
    """
    Write a minimal kraken2-style report into tmp_path and return its path.

    Rows in the report:
      - an unclassified row (rank U, taxid 0)
      - a root row (rank R, taxid 1)
      - Bacteria carrying rank code R1 (to test normalize_rank_code)
      - two genus rows meant to have spaces around their names (to test stripping)
      - one row with the non-standard rank '-', which should produce a row
        but not change lineage
      - a final empty entry, which becomes a blank line in the file
    """
    # column order: percent, clade_reads, taxon_reads, rank, taxid, name
    report_lines = [
        "10.00 100 100 U 0 unclassified",
        "90.00 900 50 R 1 root",
        "80.00 800 0 R1 2 Bacteria",
        # NOTE(review): this row is meant to have leading spaces before the
        # name, but none are visible in the literal -- confirm the whitespace.
        "25.00 250 250 G 123 GenX",
        "15.00 150 150 G 124 GenY ",  # trailing space after the name
        " 0.50 5 5 - 9999 12345_like_strain",  # non-standard rank code
        "",  # blank line, expected to be ignored by the parser
    ]
    report_path = tmp_path / "k.report"
    report_path.write_text("\n".join(report_lines) + "\n")
    return report_path
| 30 | + |
| 31 | + |
def test_parse_report_line_basic_space_split():
    """A space-delimited line is parsed into the expected record.

    The expected record holds clade_reads, taxon_reads, rank, taxid and a
    name that keeps its inner space; the leading percent column is absent.
    """
    expected = {
        "clade_reads": 120,
        "taxon_reads": 12,
        "rank": "G",
        "taxid": 1234,
        "name": "Some Genus",
    }
    parsed = k.parse_report_line("12.3 120 12 G 1234 Some Genus")
    assert parsed == expected
| 42 | + |
def test_parse_report_line_tab_fallback():
    """A tab-delimited line is parsed as well.

    The name is expected to keep its inner tab while losing the whitespace
    at both ends.
    """
    tabbed_line = "12.3\t120\t12\tG\t1234\t Some\tGenus "
    parsed = k.parse_report_line(tabbed_line)

    taxid = parsed["taxid"]
    assert taxid == 1234
    rank = parsed["rank"]
    assert rank == "G"
    name = parsed["name"]
    assert name == "Some\tGenus"
| 49 | + |
def test_normalize_rank_code_domain_R1():
    """R1 maps to D for Bacteria and Viruses, stays R1 for an unknown
    name, and a plain G code comes back unchanged."""
    cases = (
        ("R1", "Bacteria", "D"),
        ("R1", "Viruses", "D"),
        ("R1", "WeirdDomain", "R1"),
        ("G", "Genus", "G"),
    )
    for rank_code, name, expected in cases:
        assert k.normalize_rank_code(rank_code, name) == expected
| 55 | + |
| 56 | + |
def test_parse_report_builds_lineages(report_min):
    """parse_report fills lineage columns for the rows of the fixture report."""
    table = k.parse_report(str(report_min))

    def first_row(taxid):
        # First row whose taxid column matches; raises IndexError if none does.
        return table[table["taxid"] == taxid].iloc[0]

    # The unclassified row carries "Unclassified" at every standard rank.
    unclassified = first_row(0)
    for rank in k.STD_RANKS:
        assert unclassified[rank] == "Unclassified"
    assert unclassified["read_counts"] == 100

    # Both genus rows have clean names and inherit the Bacteria domain.
    gen_x = first_row(123)
    gen_y = first_row(124)
    assert gen_x["genus"] == "GenX"
    assert gen_y["genus"] == "GenY"
    assert gen_x["domain"] == "Bacteria"
    assert gen_y["domain"] == "Bacteria"

    # The '-' rank row still yields a row and keeps the current domain.
    dash_row = first_row(9999)
    dash_domain = dash_row["domain"]
    assert dash_domain == "Bacteria"
    assert dash_row["domain"] != "Unclassified"
| 74 | + |
| 75 | + |
def test_refine_df_aggregates_duplicates_and_computes_percents():
    """Two rows sharing taxid 5 are summed, and percent_of_reads is
    computed against the total read count."""
    gen_z_lineage = {
        "domain": "Bacteria",
        "phylum": "Firmicutes",
        "class": "Bacilli",
        "order": "NA",
        "family": "NA",
        "genus": "GenZ",
        "species": "NA",
    }
    unclassified_lineage = dict.fromkeys(k.STD_RANKS, "Unclassified")
    records = [
        {"taxid": 5, **gen_z_lineage, "read_counts": 30},
        {"taxid": 5, **gen_z_lineage, "read_counts": 70},
        {"taxid": 0, **unclassified_lineage, "read_counts": 100},
    ]
    frame = pd.DataFrame(records)
    refined = k.refine_df(frame.copy())

    # 30 + 70 reads collapse into a single taxid-5 row.
    merged = refined[refined["taxid"] == 5].iloc[0]
    assert merged["read_counts"] == 100

    # 100 reads each out of 200 total -> 50% apiece.
    percents = refined.set_index("taxid")["percent_of_reads"]
    percent_by_taxid = percents.to_dict()
    assert np.isclose(percent_by_taxid[5], 50.0, atol=1e-6)
    assert np.isclose(percent_by_taxid[0], 50.0, atol=1e-6)
| 91 | + |
def test_refine_df_zero_total_reads():
    """When every row has zero reads, refine_df returns an empty frame."""
    na_lineage = dict.fromkeys(k.STD_RANKS, "NA")
    records = [
        {"taxid": taxid, **na_lineage, "read_counts": 0}
        for taxid in (1, 2)
    ]
    frame = pd.DataFrame(records)
    refined = k.refine_df(frame.copy())
    # Rows with zero reads are expected to be dropped, leaving nothing.
    assert refined.empty
| 101 | + |
| 102 | + |
def test_sort_df_custom_groups_and_stability():
    """sort_df orders rows by group, then lineage, then taxid.

    Expected grouping:
      - the Unclassified row (taxid 0) first
      - the all-NA lineage row second
      - remaining rows sorted by lineage, ties broken by taxid

    The original author's note says the sort is a stable mergesort; that is
    a property of sort_df itself and is not visible from this test.
    """
    lower_ranks = {
        "class": "C1",
        "order": "O1",
        "family": "F1",
        "genus": "G1",
        "species": "S1",
    }
    bacteria = {"domain": "Bacteria", "phylum": "Actino", **lower_ranks}
    archaea = {"domain": "Archaea", "phylum": "Eury", **lower_ranks}
    unclassified = dict.fromkeys(k.STD_RANKS, "Unclassified")
    all_na = dict.fromkeys(k.STD_RANKS, "NA")

    # Deliberately out of order so the sort has work to do.
    records = [
        {"taxid": 3, **bacteria, "read_counts": 10},
        {"taxid": 0, **unclassified, "read_counts": 10},
        {"taxid": 2, **all_na, "read_counts": 10},
        {"taxid": 4, **archaea, "read_counts": 10},
        {"taxid": 5, **archaea, "read_counts": 10},
    ]
    frame = pd.DataFrame(records)
    ordered = k.sort_df(frame.copy())

    # 0 first, then 2, then Archaea 4 and 5 (taxid ascending), then Bacteria 3.
    expected_order = [0, 2, 4, 5, 3]
    assert list(ordered["taxid"]) == expected_order
| 121 | + |
| 122 | + |
def test_kraken2_to_taxon_summaries_writes_tsv(tmp_path, report_min, monkeypatch):
    """The end-to-end entry point writes a TSV with the expected columns."""

    def skip_file_check(paths):
        # Stand-in for the module's file checker so the unit test bypasses it.
        return None

    monkeypatch.setattr(k, "check_files_are_found", skip_file_check)

    summary_path = tmp_path / "summary.tsv"
    k.kraken2_to_taxon_summaries(str(report_min), str(summary_path))

    assert summary_path.exists()

    table = pd.read_csv(summary_path, sep="\t")
    expected_columns = ["taxid"] + k.STD_RANKS + ["read_counts", "percent_of_reads"]
    assert list(table.columns) == expected_columns

    # The unclassified row must be present in the output.
    has_unclassified = (table["taxid"] == 0).any()
    assert has_unclassified

    def is_float(value):
        return isinstance(value, float)

    # Every percent value must read back as a float.
    assert table["percent_of_reads"].map(is_float).all()
| 137 | + |
| 138 | + |
def test_parse_report_ignores_blank_and_handles_U_and_R1(tmp_path):
    """A whitespace-only line is tolerated, the U row is kept, and a genus
    after an R1 Bacteria row gets Bacteria as its domain."""
    report_lines = [
        " ",  # whitespace-only line
        "5.0 50 50 U 0 unclassified",
        "95.0 950 0 R1 2 Bacteria",
        "20.0 200 200 G 9 GenA",
    ]
    report_path = tmp_path / "mini.report"
    report_path.write_text("\n".join(report_lines))

    table = k.parse_report(str(report_path))

    has_unclassified = (table["taxid"] == 0).any()
    assert has_unclassified

    genus_row = table[table["taxid"] == 9].iloc[0]
    assert genus_row["domain"] == "Bacteria"
| 153 | + |
| 154 | + |
def test_preflight_checks_calls_validator(monkeypatch):
    """preflight_checks hands its argument to check_files_are_found as a
    one-element list."""
    seen = {}

    def record_paths(paths):
        # Remember the first argument the validator was called with.
        return seen.setdefault("ok", paths)

    monkeypatch.setattr(k, "check_files_are_found", record_paths)
    k.preflight_checks("abc.txt")

    assert "ok" in seen
    assert seen["ok"] == ["abc.txt"]
| 160 | + |
| 161 | + |
def test_parse_report_line_bad_line_uses_report_failure(monkeypatch):
    """A line with too few fields is routed through report_failure.

    report_failure is replaced with a stub that raises ValueError, so the
    test can observe the call as an exception.
    """

    def fake_report_failure(msg):
        raise ValueError(msg)

    monkeypatch.setattr(k, "report_failure", fake_report_failure)

    with pytest.raises(ValueError):
        k.parse_report_line("not enough fields")
0 commit comments