66"""
77
88import gzip
9+ import json
910import pandas as pd
1011import pysam
1112import pytest
@@ -33,7 +34,7 @@ class TestFinalBamOutput:
3334 @pytest .fixture
3435 def final_bam_path (self ):
3536 """Get path to sample1 final BAM."""
36- path = TEST_OUTPUTS_DIR / "bam" / "final" / "sample1.bam"
37+ path = TEST_OUTPUTS_DIR / "bam" / "final" / "sample1" / "sample1 .bam"
3738 if not path .exists ():
3839 pytest .skip ("Final BAM not found" )
3940 return path
@@ -56,7 +57,7 @@ def test_final_bam_has_cl_tag(self, final_bam_path):
5657 reads_with_cl += 1
5758 # CL should be in valid range
5859 cl_val = read .get_tag ("CL" )
59- if isinstance (cl_val , (list , tuple )):
60+ if not isinstance (cl_val , (int , float )):
6061 cl_val = cl_val [0 ]
6162 assert 0 <= cl_val <= 255
6263
@@ -191,7 +192,7 @@ def align_stats_path(self):
191192 / "summary"
192193 / "tables"
193194 / "sample1"
194- / "sample1.align_stats.txt "
195+ / "sample1.align_stats.tsv.gz "
195196 )
196197 if not path .exists ():
197198 pytest .skip ("Alignment stats not found" )
@@ -200,21 +201,21 @@ def align_stats_path(self):
200201 def test_align_stats_exists (self , align_stats_path ):
201202 """Alignment stats file should exist and have content."""
202203 assert align_stats_path .exists ()
203- content = align_stats_path . read_text ( )
204- assert len (content ) > 0
204+ df = pd . read_csv ( align_stats_path , sep = " \t " )
205+ assert len (df ) > 0
205206
206207 def test_align_stats_has_key_metrics (self , align_stats_path ):
207- """Alignment stats should contain expected metrics ."""
208- content = align_stats_path . read_text ( )
208+ """Alignment stats should contain expected columns ."""
209+ df = pd . read_csv ( align_stats_path , sep = " \t " )
209210
210- # Check for expected metric names
211- expected_metrics = [
212- "total_alignments " ,
213- "passed_alignments " ,
211+ expected_columns = [
212+ "n_reads" ,
213+ "pct_mapped " ,
214+ "mapped_reads " ,
214215 ]
215216
216- for metric in expected_metrics :
217- assert metric in content , f"Missing metric : { metric } "
217+ for col in expected_columns :
218+ assert col in df . columns , f"Missing column : { col } "
218219
219220
220221@pytest .mark .skipif (
@@ -249,3 +250,287 @@ def test_validated_reference_is_fasta(self, validated_ref_path):
249250 lines = content .strip ().split ("\n " )
250251 seq_lines = [l for l in lines if not l .startswith (">" )]
251252 assert len (seq_lines ) > 0
253+
254+
@pytest.mark.skipif(
    not (TEST_OUTPUTS_DIR / "summary" / "tables").exists(),
    reason="Pre-computed outputs not available",
)
class TestBaseCallingErrorOutput:
    """Validate the per-position base-calling error table for sample1."""

    @pytest.fixture
    def bcerror_path(self):
        """Path to sample1's bcerror table; skip if it was not generated."""
        path = TEST_OUTPUTS_DIR / "summary" / "tables" / "sample1" / "sample1.bcerror.tsv.gz"
        if not path.exists():
            pytest.skip("bcerror output not found")
        return path

    def test_bcerror_columns(self, bcerror_path):
        """All expected columns should be present."""
        df = pd.read_csv(bcerror_path, sep="\t")
        for col in [
            "Reference",
            "Position",
            "Spanning_Reads",
            "MismatchFreq",
            "InsertionFreq",
            "DeletionFreq",
            "BCErrorFreq",
        ]:
            assert col in df.columns, f"Missing column: {col}"

    def test_freq_values_in_range(self, bcerror_path):
        """Error frequencies are proportions and must lie in [0, 1]."""
        df = pd.read_csv(bcerror_path, sep="\t")
        for col in ["MismatchFreq", "InsertionFreq", "DeletionFreq", "BCErrorFreq"]:
            assert df[col].min() >= 0, f"{col} has negative values"
            assert df[col].max() <= 1, f"{col} has values > 1"

    def test_positions_1_indexed(self, bcerror_path):
        """Reference positions are reported 1-based."""
        df = pd.read_csv(bcerror_path, sep="\t")
        assert df["Position"].min() >= 1
282+
283+
@pytest.mark.skipif(
    not (TEST_OUTPUTS_DIR / "summary" / "modkit").exists(),
    reason="Pre-computed outputs not available",
)
class TestModkitOutputs:
    """Validate modkit-derived outputs: bedMethyl pileup and per-read tables."""

    @pytest.fixture
    def pileup_path(self):
        """Path to sample1's bedMethyl pileup; skip if absent."""
        path = TEST_OUTPUTS_DIR / "summary" / "modkit" / "sample1" / "sample1.pileup.bed.gz"
        if not path.exists():
            pytest.skip("Pileup bed not found")
        return path

    @pytest.fixture
    def mod_calls_path(self):
        """Path to sample1's per-read modification calls; skip if absent."""
        path = TEST_OUTPUTS_DIR / "summary" / "modkit" / "sample1" / "sample1.mod_calls.tsv.gz"
        if not path.exists():
            pytest.skip("mod_calls not found")
        return path

    @pytest.fixture
    def mod_full_path(self):
        """Path to sample1's full modification table; skip if absent."""
        path = TEST_OUTPUTS_DIR / "summary" / "modkit" / "sample1" / "sample1.mod_full.tsv.gz"
        if not path.exists():
            pytest.skip("mod_full not found")
        return path

    def test_pileup_bedmethyl_format(self, pileup_path):
        """Pileup should be non-empty, headerless bedMethyl (>= 10 columns)."""
        df = pd.read_csv(pileup_path, sep="\t", header=None)
        assert df.shape[1] >= 10, "bedMethyl should have at least 10 columns"
        assert len(df) > 0

    def test_mod_calls_has_required_columns(self, mod_calls_path):
        """Per-read calls table must expose the key columns."""
        # nrows=5 keeps the header check cheap on large tables.
        df = pd.read_csv(mod_calls_path, sep="\t", nrows=5)
        for col in ["read_id", "ref_position", "chrom", "call_code"]:
            assert col in df.columns, f"Missing column: {col}"

    def test_mod_full_has_header(self, mod_full_path):
        """Full table should carry a header row and at least one record."""
        df = pd.read_csv(mod_full_path, sep="\t", nrows=5)
        assert "read_id" in df.columns
        assert len(df) > 0
324+
325+
@pytest.mark.skipif(
    not (TEST_OUTPUTS_DIR / "summary" / "tables").exists(),
    reason="Pre-computed outputs not available",
)
class TestCoverageBedgraphOutput:
    """Validate coverage bedGraphs: raw counts and CPM-normalized."""

    @pytest.fixture
    def counts_path(self):
        """Path to sample1's raw-counts bedGraph; skip if absent."""
        path = TEST_OUTPUTS_DIR / "summary" / "tables" / "sample1" / "sample1.counts.bg.gz"
        if not path.exists():
            pytest.skip("counts bedgraph not found")
        return path

    @pytest.fixture
    def cpm_path(self):
        """Path to sample1's CPM bedGraph; skip if absent."""
        path = TEST_OUTPUTS_DIR / "summary" / "tables" / "sample1" / "sample1.cpm.bg.gz"
        if not path.exists():
            pytest.skip("cpm bedgraph not found")
        return path

    def test_bedgraph_4_columns(self, counts_path):
        """bedGraph format is chrom/start/end/value."""
        df = pd.read_csv(counts_path, sep="\t", header=None)
        assert df.shape[1] == 4, "bedGraph should have exactly 4 columns"

    def test_values_non_negative(self, counts_path, cpm_path):
        """Coverage values (column 4) can never be negative."""
        for path in [counts_path, cpm_path]:
            df = pd.read_csv(path, sep="\t", header=None)
            assert df.iloc[:, 3].min() >= 0

    def test_positions_0_based(self, counts_path):
        """bedGraph start coordinates (column 2) are 0-based."""
        df = pd.read_csv(counts_path, sep="\t", header=None)
        assert df.iloc[:, 1].min() >= 0

    def test_same_chroms_in_both(self, counts_path, cpm_path):
        """Counts and CPM tracks should cover the same set of references."""
        counts_df = pd.read_csv(counts_path, sep="\t", header=None)
        cpm_df = pd.read_csv(cpm_path, sep="\t", header=None)
        assert set(counts_df.iloc[:, 0]) == set(cpm_df.iloc[:, 0])
362+
363+
@pytest.mark.skipif(
    not (TEST_OUTPUTS_DIR / "manifest.json").exists(),
    reason="Pre-computed outputs not available",
)
class TestManifestOutput:
    """Validate the pipeline run manifest (manifest.json)."""

    @pytest.fixture
    def manifest(self):
        """Parsed manifest.json as a dict."""
        path = TEST_OUTPUTS_DIR / "manifest.json"
        with open(path) as f:
            return json.load(f)

    def test_valid_json(self, manifest):
        """Manifest must be a JSON object at the top level."""
        assert isinstance(manifest, dict)

    def test_required_top_level_keys(self, manifest):
        """All required top-level sections must be present."""
        for key in ["manifest_version", "pipeline", "execution", "config", "samples", "tools"]:
            assert key in manifest, f"Missing key: {key}"

    def test_status_success(self, manifest):
        """The recorded run must have completed successfully."""
        assert manifest["execution"]["status"] == "success"

    def test_sample_count(self, manifest):
        """The test dataset processes exactly two samples."""
        assert manifest["samples"]["count"] == 2
387+
388+
@pytest.mark.skipif(
    not (TEST_OUTPUTS_DIR / "squiggy-session.json").exists(),
    reason="Pre-computed outputs not available",
)
class TestSquiggySessionOutput:
    """Validate the Squiggy session file (squiggy-session.json)."""

    @pytest.fixture
    def session(self):
        """Parsed squiggy-session.json as a dict."""
        path = TEST_OUTPUTS_DIR / "squiggy-session.json"
        with open(path) as f:
            return json.load(f)

    def test_valid_json(self, session):
        """Session must be a JSON object at the top level."""
        assert isinstance(session, dict)

    def test_has_samples(self, session):
        """Both test samples must be registered in the session."""
        samples = session.get("samples", {})
        assert "sample1" in samples
        assert "sample2" in samples

    def test_sample_has_paths(self, session):
        """Each sample entry must reference its BAM, POD5, and FASTA paths."""
        for sample_name in ["sample1", "sample2"]:
            sample = session["samples"][sample_name]
            assert "bamPath" in sample
            assert "pod5Paths" in sample
            assert "fastaPath" in sample
414+
415+
@pytest.mark.skipif(
    not (TEST_OUTPUTS_DIR / "summary" / "qc" / "reference_similarity.tsv").exists(),
    reason="Pre-computed outputs not available",
)
class TestReferenceSimilarityOutput:
    """Validate the pairwise reference-similarity matrix (percent identity)."""

    @pytest.fixture
    def sim_matrix(self):
        """Similarity matrix as a DataFrame indexed by reference name."""
        path = TEST_OUTPUTS_DIR / "summary" / "qc" / "reference_similarity.tsv"
        return pd.read_csv(path, sep="\t", index_col=0)

    def test_square_matrix(self, sim_matrix):
        """Pairwise matrix must be square."""
        assert sim_matrix.shape[0] == sim_matrix.shape[1]

    def test_diagonal_100(self, sim_matrix):
        """Self-similarity is 100%."""
        for i in range(sim_matrix.shape[0]):
            assert sim_matrix.iloc[i, i] == pytest.approx(100.0)

    def test_symmetric(self, sim_matrix):
        """Similarity is symmetric (within rounding tolerance)."""
        # Only the upper triangle needs checking against its mirror.
        for i in range(sim_matrix.shape[0]):
            for j in range(i + 1, sim_matrix.shape[1]):
                assert sim_matrix.iloc[i, j] == pytest.approx(sim_matrix.iloc[j, i], abs=0.01)

    def test_values_in_range(self, sim_matrix):
        """Percent-identity values must lie in [0, 100]."""
        assert sim_matrix.min().min() >= 0
        assert sim_matrix.max().max() <= 100
441+
442+
@pytest.mark.skipif(
    not (TEST_OUTPUTS_DIR / "reference" / "trna_only.fa").exists(),
    reason="Pre-computed outputs not available",
)
class TestTrnaOnlyReference:
    """Validate the adapter-stripped, tRNA-only reference FASTA."""

    @pytest.fixture
    def trna_only_path(self):
        """Path to the tRNA-only reference FASTA."""
        return TEST_OUTPUTS_DIR / "reference" / "trna_only.fa"

    def test_valid_fasta(self, trna_only_path):
        """File must start with a FASTA header and contain sequence lines."""
        with open(trna_only_path) as f:
            content = f.read()
        assert content.startswith(">")
        seq_lines = [l for l in content.strip().split("\n") if not l.startswith(">")]
        assert len(seq_lines) > 0

    def test_no_adapter_substrings(self, trna_only_path):
        """Adapter sequences must have been stripped from every record."""
        adapter_5p = "CCTAAGAGCAAGAAGAAGCCTGG"
        adapter_3p_prefix = "GGCTTCTTCTTGCTCTT"
        with open(trna_only_path) as f:
            for line in f:
                if line.startswith(">"):
                    continue
                seq = line.strip().upper()
                assert adapter_5p not in seq, "Found 5' adapter in tRNA-only reference"
                assert adapter_3p_prefix not in seq, "Found 3' adapter in tRNA-only reference"

    def test_sequences_end_with_cca(self, trna_only_path):
        """Every mature tRNA sequence must terminate in the CCA tail."""
        sequences = {}
        current = None
        with open(trna_only_path) as f:
            for line in f:
                line = line.strip()
                if line.startswith(">"):
                    # Record name is the first whitespace-delimited token.
                    current = line[1:].split()[0]
                    sequences[current] = ""
                else:
                    sequences[current] += line.upper()
        for name, seq in sequences.items():
            assert seq.endswith("CCA"), f"{name} does not end with CCA: ...{seq[-5:]}"
483+
484+
@pytest.mark.skipif(
    not (TEST_OUTPUTS_DIR / "bam" / "final").exists()
    or not (TEST_OUTPUTS_DIR / "summary" / "tables").exists(),
    reason="Pre-computed outputs not available",
)
class TestMultiSampleConsistency:
    """Check that every per-sample output exists and agrees across samples."""

    SAMPLES = ["sample1", "sample2"]

    def test_both_samples_have_final_bam(self):
        """Each sample gets a final BAM at bam/final/<sample>/<sample>.bam."""
        for sample in self.SAMPLES:
            path = TEST_OUTPUTS_DIR / "bam" / "final" / sample / f"{sample}.bam"
            assert path.exists(), f"Missing final BAM for {sample}"

    def test_both_samples_have_charging_tables(self):
        """Each sample gets a charging-probability table."""
        for sample in self.SAMPLES:
            path = (
                TEST_OUTPUTS_DIR / "summary" / "tables" / sample
                / f"{sample}.charging_prob.tsv.gz"
            )
            assert path.exists(), f"Missing charging table for {sample}"

    def test_both_samples_have_bcerror(self):
        """Each sample gets a base-calling error table."""
        for sample in self.SAMPLES:
            path = (
                TEST_OUTPUTS_DIR / "summary" / "tables" / sample
                / f"{sample}.bcerror.tsv.gz"
            )
            assert path.exists(), f"Missing bcerror for {sample}"

    def test_both_samples_have_modkit(self):
        """Each sample gets a modkit per-read calls table."""
        for sample in self.SAMPLES:
            path = (
                TEST_OUTPUTS_DIR / "summary" / "modkit" / sample
                / f"{sample}.mod_calls.tsv.gz"
            )
            assert path.exists(), f"Missing modkit output for {sample}"

    def test_column_names_match_between_samples(self):
        """Shared tables must have identical schemas for both samples."""
        for table in ["bcerror", "charging_prob", "charging.cpm", "align_stats"]:
            dfs = {}
            for sample in self.SAMPLES:
                path = (
                    TEST_OUTPUTS_DIR / "summary" / "tables" / sample
                    / f"{sample}.{table}.tsv.gz"
                )
                if not path.exists():
                    pytest.skip(f"{table} not found for {sample}")
                # nrows=0 reads only the header row.
                dfs[sample] = pd.read_csv(path, sep="\t", nrows=0)

            cols1 = set(dfs["sample1"].columns)
            cols2 = set(dfs["sample2"].columns)
            assert cols1 == cols2, f"Column mismatch in {table}: {cols1.symmetric_difference(cols2)}"