Skip to content

Commit 59eaf7a

Browse files
test: add unit and integration tests for pipeline scripts
Add unit tests for compute_odds_ratios, compute_seq_similarity, convert_to_trna_coords, generate_manifest, get_align_stats, and get_bcerror_freqs. Add integration tests validating pipeline outputs. Update conftest.py with shared fixtures. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent d01d0d0 commit 59eaf7a

File tree

8 files changed

+993
-15
lines changed

8 files changed

+993
-15
lines changed

tests/conftest.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -11,9 +11,10 @@
1111
import pysam
1212
import pytest
1313

14-
# Add workflow/scripts to path for imports
14+
# Add workflow/scripts and tests/ to path for imports
1515
REPO_ROOT = Path(__file__).parent.parent
1616
sys.path.insert(0, str(REPO_ROOT / "workflow" / "scripts"))
17+
sys.path.insert(0, str(Path(__file__).parent))
1718

1819
# Test data paths
1920
TEST_DATA_DIR = REPO_ROOT / ".tests"
@@ -194,7 +195,7 @@ def test_outputs_available():
194195
@pytest.fixture
195196
def test_final_bam():
196197
"""Path to pre-computed final BAM with all tags."""
197-
bam_path = TEST_OUTPUTS_DIR / "bam" / "final" / "sample1.bam"
198+
bam_path = TEST_OUTPUTS_DIR / "bam" / "final" / "sample1" / "sample1.bam"
198199
if bam_path.exists():
199200
return bam_path
200201
pytest.skip("Pre-computed test outputs not available. Run 'pixi run dl-test-data'")

tests/integration/test_output_validation.py

Lines changed: 298 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@
66
"""
77

88
import gzip
9+
import json
910
import pandas as pd
1011
import pysam
1112
import pytest
@@ -33,7 +34,7 @@ class TestFinalBamOutput:
3334
@pytest.fixture
3435
def final_bam_path(self):
3536
"""Get path to sample1 final BAM."""
36-
path = TEST_OUTPUTS_DIR / "bam" / "final" / "sample1.bam"
37+
path = TEST_OUTPUTS_DIR / "bam" / "final" / "sample1" / "sample1.bam"
3738
if not path.exists():
3839
pytest.skip("Final BAM not found")
3940
return path
@@ -56,7 +57,7 @@ def test_final_bam_has_cl_tag(self, final_bam_path):
5657
reads_with_cl += 1
5758
# CL should be in valid range
5859
cl_val = read.get_tag("CL")
59-
if isinstance(cl_val, (list, tuple)):
60+
if not isinstance(cl_val, (int, float)):
6061
cl_val = cl_val[0]
6162
assert 0 <= cl_val <= 255
6263

@@ -191,7 +192,7 @@ def align_stats_path(self):
191192
/ "summary"
192193
/ "tables"
193194
/ "sample1"
194-
/ "sample1.align_stats.txt"
195+
/ "sample1.align_stats.tsv.gz"
195196
)
196197
if not path.exists():
197198
pytest.skip("Alignment stats not found")
@@ -200,21 +201,21 @@ def align_stats_path(self):
200201
def test_align_stats_exists(self, align_stats_path):
201202
"""Alignment stats file should exist and have content."""
202203
assert align_stats_path.exists()
203-
content = align_stats_path.read_text()
204-
assert len(content) > 0
204+
df = pd.read_csv(align_stats_path, sep="\t")
205+
assert len(df) > 0
205206

206207
def test_align_stats_has_key_metrics(self, align_stats_path):
207-
"""Alignment stats should contain expected metrics."""
208-
content = align_stats_path.read_text()
208+
"""Alignment stats should contain expected columns."""
209+
df = pd.read_csv(align_stats_path, sep="\t")
209210

210-
# Check for expected metric names
211-
expected_metrics = [
212-
"total_alignments",
213-
"passed_alignments",
211+
expected_columns = [
212+
"n_reads",
213+
"pct_mapped",
214+
"mapped_reads",
214215
]
215216

216-
for metric in expected_metrics:
217-
assert metric in content, f"Missing metric: {metric}"
217+
for col in expected_columns:
218+
assert col in df.columns, f"Missing column: {col}"
218219

219220

220221
@pytest.mark.skipif(
@@ -249,3 +250,287 @@ def test_validated_reference_is_fasta(self, validated_ref_path):
249250
lines = content.strip().split("\n")
250251
seq_lines = [l for l in lines if not l.startswith(">")]
251252
assert len(seq_lines) > 0
253+
254+
255+
@pytest.mark.skipif(
256+
not (TEST_OUTPUTS_DIR / "summary" / "tables").exists(),
257+
reason="Pre-computed outputs not available",
258+
)
259+
class TestBaseCallingErrorOutput:
260+
@pytest.fixture
261+
def bcerror_path(self):
262+
path = TEST_OUTPUTS_DIR / "summary" / "tables" / "sample1" / "sample1.bcerror.tsv.gz"
263+
if not path.exists():
264+
pytest.skip("bcerror output not found")
265+
return path
266+
267+
def test_bcerror_columns(self, bcerror_path):
268+
df = pd.read_csv(bcerror_path, sep="\t")
269+
for col in ["Reference", "Position", "Spanning_Reads", "MismatchFreq",
270+
"InsertionFreq", "DeletionFreq", "BCErrorFreq"]:
271+
assert col in df.columns, f"Missing column: {col}"
272+
273+
def test_freq_values_in_range(self, bcerror_path):
274+
df = pd.read_csv(bcerror_path, sep="\t")
275+
for col in ["MismatchFreq", "InsertionFreq", "DeletionFreq", "BCErrorFreq"]:
276+
assert df[col].min() >= 0, f"{col} has negative values"
277+
assert df[col].max() <= 1, f"{col} has values > 1"
278+
279+
def test_positions_1_indexed(self, bcerror_path):
280+
df = pd.read_csv(bcerror_path, sep="\t")
281+
assert df["Position"].min() >= 1
282+
283+
284+
@pytest.mark.skipif(
285+
not (TEST_OUTPUTS_DIR / "summary" / "modkit").exists(),
286+
reason="Pre-computed outputs not available",
287+
)
288+
class TestModkitOutputs:
289+
@pytest.fixture
290+
def pileup_path(self):
291+
path = TEST_OUTPUTS_DIR / "summary" / "modkit" / "sample1" / "sample1.pileup.bed.gz"
292+
if not path.exists():
293+
pytest.skip("Pileup bed not found")
294+
return path
295+
296+
@pytest.fixture
297+
def mod_calls_path(self):
298+
path = TEST_OUTPUTS_DIR / "summary" / "modkit" / "sample1" / "sample1.mod_calls.tsv.gz"
299+
if not path.exists():
300+
pytest.skip("mod_calls not found")
301+
return path
302+
303+
@pytest.fixture
304+
def mod_full_path(self):
305+
path = TEST_OUTPUTS_DIR / "summary" / "modkit" / "sample1" / "sample1.mod_full.tsv.gz"
306+
if not path.exists():
307+
pytest.skip("mod_full not found")
308+
return path
309+
310+
def test_pileup_bedmethyl_format(self, pileup_path):
311+
df = pd.read_csv(pileup_path, sep="\t", header=None)
312+
assert df.shape[1] >= 10, "bedMethyl should have at least 10 columns"
313+
assert len(df) > 0
314+
315+
def test_mod_calls_has_required_columns(self, mod_calls_path):
316+
df = pd.read_csv(mod_calls_path, sep="\t", nrows=5)
317+
for col in ["read_id", "ref_position", "chrom", "call_code"]:
318+
assert col in df.columns, f"Missing column: {col}"
319+
320+
def test_mod_full_has_header(self, mod_full_path):
321+
df = pd.read_csv(mod_full_path, sep="\t", nrows=5)
322+
assert "read_id" in df.columns
323+
assert len(df) > 0
324+
325+
326+
@pytest.mark.skipif(
327+
not (TEST_OUTPUTS_DIR / "summary" / "tables").exists(),
328+
reason="Pre-computed outputs not available",
329+
)
330+
class TestCoverageBedgraphOutput:
331+
@pytest.fixture
332+
def counts_path(self):
333+
path = TEST_OUTPUTS_DIR / "summary" / "tables" / "sample1" / "sample1.counts.bg.gz"
334+
if not path.exists():
335+
pytest.skip("counts bedgraph not found")
336+
return path
337+
338+
@pytest.fixture
339+
def cpm_path(self):
340+
path = TEST_OUTPUTS_DIR / "summary" / "tables" / "sample1" / "sample1.cpm.bg.gz"
341+
if not path.exists():
342+
pytest.skip("cpm bedgraph not found")
343+
return path
344+
345+
def test_bedgraph_4_columns(self, counts_path):
346+
df = pd.read_csv(counts_path, sep="\t", header=None)
347+
assert df.shape[1] == 4, "bedGraph should have exactly 4 columns"
348+
349+
def test_values_non_negative(self, counts_path, cpm_path):
350+
for path in [counts_path, cpm_path]:
351+
df = pd.read_csv(path, sep="\t", header=None)
352+
assert df.iloc[:, 3].min() >= 0
353+
354+
def test_positions_0_based(self, counts_path):
355+
df = pd.read_csv(counts_path, sep="\t", header=None)
356+
assert df.iloc[:, 1].min() >= 0
357+
358+
def test_same_chroms_in_both(self, counts_path, cpm_path):
359+
counts_df = pd.read_csv(counts_path, sep="\t", header=None)
360+
cpm_df = pd.read_csv(cpm_path, sep="\t", header=None)
361+
assert set(counts_df.iloc[:, 0]) == set(cpm_df.iloc[:, 0])
362+
363+
364+
@pytest.mark.skipif(
365+
not (TEST_OUTPUTS_DIR / "manifest.json").exists(),
366+
reason="Pre-computed outputs not available",
367+
)
368+
class TestManifestOutput:
369+
@pytest.fixture
370+
def manifest(self):
371+
path = TEST_OUTPUTS_DIR / "manifest.json"
372+
with open(path) as f:
373+
return json.load(f)
374+
375+
def test_valid_json(self, manifest):
376+
assert isinstance(manifest, dict)
377+
378+
def test_required_top_level_keys(self, manifest):
379+
for key in ["manifest_version", "pipeline", "execution", "config", "samples", "tools"]:
380+
assert key in manifest, f"Missing key: {key}"
381+
382+
def test_status_success(self, manifest):
383+
assert manifest["execution"]["status"] == "success"
384+
385+
def test_sample_count(self, manifest):
386+
assert manifest["samples"]["count"] == 2
387+
388+
389+
@pytest.mark.skipif(
390+
not (TEST_OUTPUTS_DIR / "squiggy-session.json").exists(),
391+
reason="Pre-computed outputs not available",
392+
)
393+
class TestSquiggySessionOutput:
394+
@pytest.fixture
395+
def session(self):
396+
path = TEST_OUTPUTS_DIR / "squiggy-session.json"
397+
with open(path) as f:
398+
return json.load(f)
399+
400+
def test_valid_json(self, session):
401+
assert isinstance(session, dict)
402+
403+
def test_has_samples(self, session):
404+
samples = session.get("samples", {})
405+
assert "sample1" in samples
406+
assert "sample2" in samples
407+
408+
def test_sample_has_paths(self, session):
409+
for sample_name in ["sample1", "sample2"]:
410+
sample = session["samples"][sample_name]
411+
assert "bamPath" in sample
412+
assert "pod5Paths" in sample
413+
assert "fastaPath" in sample
414+
415+
416+
@pytest.mark.skipif(
417+
not (TEST_OUTPUTS_DIR / "summary" / "qc" / "reference_similarity.tsv").exists(),
418+
reason="Pre-computed outputs not available",
419+
)
420+
class TestReferenceSimilarityOutput:
421+
@pytest.fixture
422+
def sim_matrix(self):
423+
path = TEST_OUTPUTS_DIR / "summary" / "qc" / "reference_similarity.tsv"
424+
return pd.read_csv(path, sep="\t", index_col=0)
425+
426+
def test_square_matrix(self, sim_matrix):
427+
assert sim_matrix.shape[0] == sim_matrix.shape[1]
428+
429+
def test_diagonal_100(self, sim_matrix):
430+
for i in range(sim_matrix.shape[0]):
431+
assert sim_matrix.iloc[i, i] == pytest.approx(100.0)
432+
433+
def test_symmetric(self, sim_matrix):
434+
for i in range(sim_matrix.shape[0]):
435+
for j in range(i + 1, sim_matrix.shape[1]):
436+
assert sim_matrix.iloc[i, j] == pytest.approx(sim_matrix.iloc[j, i], abs=0.01)
437+
438+
def test_values_in_range(self, sim_matrix):
439+
assert sim_matrix.min().min() >= 0
440+
assert sim_matrix.max().max() <= 100
441+
442+
443+
@pytest.mark.skipif(
444+
not (TEST_OUTPUTS_DIR / "reference" / "trna_only.fa").exists(),
445+
reason="Pre-computed outputs not available",
446+
)
447+
class TestTrnaOnlyReference:
448+
@pytest.fixture
449+
def trna_only_path(self):
450+
return TEST_OUTPUTS_DIR / "reference" / "trna_only.fa"
451+
452+
def test_valid_fasta(self, trna_only_path):
453+
with open(trna_only_path) as f:
454+
content = f.read()
455+
assert content.startswith(">")
456+
seq_lines = [l for l in content.strip().split("\n") if not l.startswith(">")]
457+
assert len(seq_lines) > 0
458+
459+
def test_no_adapter_substrings(self, trna_only_path):
460+
adapter_5p = "CCTAAGAGCAAGAAGAAGCCTGG"
461+
adapter_3p_prefix = "GGCTTCTTCTTGCTCTT"
462+
with open(trna_only_path) as f:
463+
for line in f:
464+
if line.startswith(">"):
465+
continue
466+
seq = line.strip().upper()
467+
assert adapter_5p not in seq, "Found 5' adapter in tRNA-only reference"
468+
assert adapter_3p_prefix not in seq, "Found 3' adapter in tRNA-only reference"
469+
470+
def test_sequences_end_with_cca(self, trna_only_path):
471+
sequences = {}
472+
current = None
473+
with open(trna_only_path) as f:
474+
for line in f:
475+
line = line.strip()
476+
if line.startswith(">"):
477+
current = line[1:].split()[0]
478+
sequences[current] = ""
479+
else:
480+
sequences[current] += line.upper()
481+
for name, seq in sequences.items():
482+
assert seq.endswith("CCA"), f"{name} does not end with CCA: ...{seq[-5:]}"
483+
484+
485+
@pytest.mark.skipif(
486+
not (TEST_OUTPUTS_DIR / "bam" / "final").exists()
487+
or not (TEST_OUTPUTS_DIR / "summary" / "tables").exists(),
488+
reason="Pre-computed outputs not available",
489+
)
490+
class TestMultiSampleConsistency:
491+
SAMPLES = ["sample1", "sample2"]
492+
493+
def test_both_samples_have_final_bam(self):
494+
for sample in self.SAMPLES:
495+
path = TEST_OUTPUTS_DIR / "bam" / "final" / sample / f"{sample}.bam"
496+
assert path.exists(), f"Missing final BAM for {sample}"
497+
498+
def test_both_samples_have_charging_tables(self):
499+
for sample in self.SAMPLES:
500+
path = (
501+
TEST_OUTPUTS_DIR / "summary" / "tables" / sample
502+
/ f"{sample}.charging_prob.tsv.gz"
503+
)
504+
assert path.exists(), f"Missing charging table for {sample}"
505+
506+
def test_both_samples_have_bcerror(self):
507+
for sample in self.SAMPLES:
508+
path = (
509+
TEST_OUTPUTS_DIR / "summary" / "tables" / sample
510+
/ f"{sample}.bcerror.tsv.gz"
511+
)
512+
assert path.exists(), f"Missing bcerror for {sample}"
513+
514+
def test_both_samples_have_modkit(self):
515+
for sample in self.SAMPLES:
516+
path = (
517+
TEST_OUTPUTS_DIR / "summary" / "modkit" / sample
518+
/ f"{sample}.mod_calls.tsv.gz"
519+
)
520+
assert path.exists(), f"Missing modkit output for {sample}"
521+
522+
def test_column_names_match_between_samples(self):
523+
for table in ["bcerror", "charging_prob", "charging.cpm", "align_stats"]:
524+
dfs = {}
525+
for sample in self.SAMPLES:
526+
path = (
527+
TEST_OUTPUTS_DIR / "summary" / "tables" / sample
528+
/ f"{sample}.{table}.tsv.gz"
529+
)
530+
if not path.exists():
531+
pytest.skip(f"{table} not found for {sample}")
532+
dfs[sample] = pd.read_csv(path, sep="\t", nrows=0)
533+
534+
cols1 = set(dfs["sample1"].columns)
535+
cols2 = set(dfs["sample2"].columns)
536+
assert cols1 == cols2, f"Column mismatch in {table}: {cols1.symmetric_difference(cols2)}"

0 commit comments

Comments (0)