@@ -125,51 +125,99 @@ def test_remove_edge_labels(phylogeny_builder: Phylogeny):
125125# assert len(lines) == 4
126126
def test_generate_pairwise_comparison_table(phylogeny_builder: Phylogeny, tmp_path: Path):
    """Check the TSV written by ``_generate_pairwise_comparison_table``.

    Four 12-base sequences and a stub distance matrix are handed to the
    method. The file it writes is read back with pandas and checked for:

    * the exact column names, in order;
    * one row per unordered sample pair (6 rows for 4 samples);
    * the overlap counts of hand-verified pairs;
    * a ``distance_category`` that agrees with the distance served by the
      stub matrix.

    Args:
        phylogeny_builder: Object under test (a test parameter, so
            presumably supplied by a pytest fixture of the same name).
        tmp_path: Directory the output TSV is written into.
    """
    # Test sequences with clear, hand-verifiable patterns (all 12 bases long).
    sequences = {
        "sample1": "ATCGATCGATCG",  # All ACGT (12 maximal info positions)
        "sample2": "ATCGATCGATCG",  # Identical to sample1
        "sample3": "ATRYNNNHBVVT",  # Mixed: ACGT, 2-base IUPAC (R,Y), 3-base IUPAC (H,B,V), and N's
        "sample4": "ATNYNNNHBVNT",  # Similar to sample3 but with more Ns
    }

    class MockDistanceMatrix:
        """Stand-in distance matrix exposing ``names`` and ``get_distance``."""

        def __init__(self):
            self.names = ['sample1', 'sample2', 'sample3', 'sample4']
            # One entry per unordered pair, keyed "<a>-<b>"; each value is
            # chosen to land in a different distance category.
            self.distances = {
                'sample1-sample2': 0.00005,  # same_strain
                'sample1-sample3': 0.008,    # same_species
                'sample1-sample4': 0.012,    # same_subspecies
                'sample2-sample3': 0.008,    # same_species
                'sample2-sample4': 0.012,    # same_subspecies
                'sample3-sample4': 0.025,    # divergent
            }

        def get_distance(self, name1, name2):
            """Return the distance for a pair, accepting either name order."""
            if name1 == name2:
                return 0.0
            key1 = f"{name1}-{name2}"
            key2 = f"{name2}-{name1}"
            # Pairs absent from the table fall back to 0.1.
            return self.distances.get(key1, self.distances.get(key2, 0.1))

    dists = MockDistanceMatrix()
    output_file = tmp_path / "test_comparison.tsv"

    # Call the method
    phylogeny_builder._generate_pairwise_comparison_table(sequences, dists, output_file)

    # Verify file exists and can be read
    assert output_file.exists()
    df = pd.read_csv(output_file, sep='\t')

    # Columns must match exactly, including order.
    expected_columns = [
        'sample1', 'sample2', 'total_length', 'overlap_noN_info_count',
        'overlap_noIUPAC_info_count', 'overlap_noN_info_pc', 'overlap_noIUPAC_info_pc',
        'noN_info_pc_sample1', 'noN_info_pc_sample2', 'noIUPAC_info_pc_sample1',
        'noIUPAC_info_pc_sample2', 'distance', 'distance_category',
    ]
    assert list(df.columns) == expected_columns

    # 4 samples -> C(4,2) = 6 unordered pairs
    assert len(df) == 6

    # sample1 vs sample2 (identical, all ACGT, distance 0.00005 -> same_strain)
    row_12 = df[(df['sample1'] == 'sample1') & (df['sample2'] == 'sample2')].iloc[0]
    assert row_12['total_length'] == 12
    assert row_12['overlap_noN_info_count'] == 12
    assert row_12['overlap_noIUPAC_info_count'] == 12
    assert row_12['distance_category'] == 'same_strain'

    # sample1 vs sample3 (distance 0.008 -> same_species)
    row_13 = df[(df['sample1'] == 'sample1') & (df['sample2'] == 'sample3')].iloc[0]
    assert row_13['total_length'] == 12

    # Both have minimal info at positions 0,1,2,3,7,8,9,10,11 = 9 positions
    # (positions 4,5,6 have N in sample3, which is NOT in MINIMAL_INFO)
    assert row_13['overlap_noN_info_count'] == 9

    # Both have maximal info (ACGT) at positions 0,1,11 = 3 positions
    assert row_13['overlap_noIUPAC_info_count'] == 3
    assert row_13['distance_category'] == 'same_species'

    # sample3 vs sample4 (distance 0.025 -> divergent)
    row_34 = df[(df['sample1'] == 'sample3') & (df['sample2'] == 'sample4')].iloc[0]
    assert row_34['distance_category'] == 'divergent'

    # Table-wide integrity checks
    assert (df['total_length'] == 12).all()
    assert (df['overlap_noN_info_count'] >= df['overlap_noIUPAC_info_count']).all()
    assert (df['overlap_noN_info_pc'] >= df['overlap_noIUPAC_info_pc']).all()
    assert (df['distance'] >= 0).all()
    assert df['distance_category'].isin(
        ['same_strain', 'same_species', 'same_subspecies', 'divergent']
    ).all()

    # Every category label must agree with the distance range it stands for.
    strain_rows = df[df['distance_category'] == 'same_strain']
    assert (strain_rows['distance'] <= 0.0001).all()

    species_rows = df[df['distance_category'] == 'same_species']
    assert ((species_rows['distance'] > 0.0001) & (species_rows['distance'] <= 0.01)).all()

    subspecies_rows = df[df['distance_category'] == 'same_subspecies']
    assert ((subspecies_rows['distance'] > 0.01) & (subspecies_rows['distance'] <= 0.015)).all()

    divergent_rows = df[df['distance_category'] == 'divergent']
    assert (divergent_rows['distance'] > 0.015).all()
173221
174222
175223def test_execute (phylogeny_builder : Phylogeny ):
0 commit comments