@@ -67,59 +67,62 @@ use std::ops::Range;
6767use anyhow:: { anyhow, Context } ;
6868
6969
70+ #[ derive( PartialEq , Debug ) ]
7071pub struct DNA ;
72+ #[ derive( PartialEq , Debug ) ]
7173pub struct Protein ;
7274
75+ #[ derive( PartialEq , Debug ) ]
7376pub struct Alignment < Kind > {
74- data : Vec < u8 > ,
75- num_rows : usize ,
76- num_cols : usize ,
77- ids : Vec < String > ,
78- id_to_index : HashMap < String , usize > ,
79- _marker : std:: marker:: PhantomData < Kind > ,
77+ pub sequence_data : Vec < u8 > ,
78+ pub rows : usize ,
79+ pub cols : usize ,
80+ pub ids : Vec < String > ,
81+ pub id_to_index : HashMap < String , usize > ,
82+ pub _marker : std:: marker:: PhantomData < Kind > ,
8083}
8184
8285impl < Kind > Alignment < Kind > {
83- pub fn new ( data : Vec < u8 > , num_rows : usize , ids : Vec < String > ) -> Self {
84- let num_cols = data . len ( ) / num_rows ;
86+ pub fn new ( sequence_data : Vec < u8 > , rows : usize , ids : Vec < String > ) -> Self {
87+ let cols = sequence_data . len ( ) / rows ;
8588 let id_to_index = ids. iter ( ) . enumerate ( ) . map ( |( i, id) | ( id. clone ( ) , i) ) . collect ( ) ;
86- Self { data , num_rows , num_cols , ids, id_to_index, _marker : std:: marker:: PhantomData }
89+ Self { sequence_data , rows , cols , ids, id_to_index, _marker : std:: marker:: PhantomData }
8790 }
8891 //to access a row by ID
8992 pub fn get_row_by_id ( & self , id : & str ) -> Option < & [ u8 ] > {
9093 let & idx = self . id_to_index . get ( id) ?;
91- let start = idx * self . num_cols ;
92- Some ( & self . data [ start..start + self . num_cols ] )
94+ let start = idx * self . cols ;
95+ Some ( & self . sequence_data [ start..start + self . cols ] )
9396 }
9497 //to delete a column by start and end
9598 pub fn remove_columns ( & mut self , start : usize , end : usize ) {
9699 let cols_to_remove = end - start;
97- let mut new_data = Vec :: with_capacity ( self . data . len ( ) - ( self . num_rows * cols_to_remove) ) ;
100+ let mut new_data = Vec :: with_capacity ( self . sequence_data . len ( ) - ( self . rows * cols_to_remove) ) ;
98101
99- for row in self . data . chunks_exact ( self . num_cols ) {
102+ for row in self . sequence_data . chunks_exact ( self . cols ) {
100103 //keeping everything before the slice and everything after
101104 new_data. extend_from_slice ( & row[ ..start] ) ;
102105 new_data. extend_from_slice ( & row[ end..] ) ;
103106 }
104107
105- self . data = new_data;
106- self . num_cols -= cols_to_remove;
108+ self . sequence_data = new_data;
109+ self . cols -= cols_to_remove;
107110 }
108111 //to delete a row by indexed number
109112 pub fn remove_row_by_number ( & mut self , row_idx : usize ) {
110- if row_idx >= self . num_rows { return ; }
113+ if row_idx >= self . rows { return ; }
111114
112- let start = row_idx * self . num_cols ;
113- let end = start + self . num_cols ;
115+ let start = row_idx * self . cols ;
116+ let end = start + self . cols ;
114117
115118 // Remove from the data vector
116- self . data . drain ( start..end) ;
119+ self . sequence_data . drain ( start..end) ;
117120
118121 // Update the ID map and row count
119122 let id_to_remove = self . ids [ row_idx] . clone ( ) ;
120123 self . id_to_index . remove ( & id_to_remove) ;
121124 self . ids . remove ( row_idx) ;
122- self . num_rows -= 1 ;
125+ self . rows -= 1 ;
123126
124127 // Re-index the map (crucial for maintaining O(1) lookup)
125128 self . reindex_map ( ) ;
@@ -140,37 +143,36 @@ impl<Kind> Alignment<Kind> {
140143 pub fn column_counts ( & self , col_idx : usize ) -> HashMap < u8 , usize > {
141144 let mut counts = HashMap :: new ( ) ;
142145
143- for r in 0 ..self . num_rows {
144- let residue = self . data [ r * self . num_cols + col_idx] ;
145- // The .entry() API is the "Senior" way to do this in Rust
146+ for r in 0 ..self . rows {
147+ let residue = self . sequence_data [ r * self . cols + col_idx] ;
146148 * counts. entry ( residue) . or_insert ( 0 ) += 1 ;
147149 }
148150
149151 counts
150152 }
151153 //to get a list vector of counters for every column in the alignment
152154 pub fn all_column_counts ( & self ) -> Vec < HashMap < u8 , usize > > {
153- ( 0 ..self . num_cols )
155+ ( 0 ..self . cols )
154156 . map ( |c| self . column_counts ( c) )
155157 . collect ( )
156158 }
157159 //to check for gaps - Returns true if the proportion of gaps '-' in a column exceeds the provided cutoff (0.0 to 1.0)
158160 pub fn is_gap_heavy ( & self , col_idx : usize , cutoff : f32 ) -> bool {
159161 let mut gap_count = 0 ;
160162
161- for r in 0 ..self . num_rows {
162- if self . data [ r * self . num_cols + col_idx] == b'-' {
163+ for r in 0 ..self . rows {
164+ if self . sequence_data [ r * self . cols + col_idx] == b'-' {
163165 gap_count += 1 ;
164166 }
165167 }
166168
167- let gap_proportion = gap_count as f32 / self . num_rows as f32 ;
169+ let gap_proportion = gap_count as f32 / self . rows as f32 ;
168170 gap_proportion > cutoff
169171 }
170172 //to identify columns that exceed the provided gap threshold
171173 //returns a list of indices to be removed
172174 pub fn identify_gappy_columns ( & self , cutoff : f32 ) -> Vec < usize > {
173- ( 0 ..self . num_cols )
175+ ( 0 ..self . cols )
174176 . filter ( |& c| self . is_gap_heavy ( c, cutoff) )
175177 . collect ( )
176178 }
@@ -187,8 +189,8 @@ impl<Kind> Alignment<Kind> {
187189 //col_range: e.g., Some(0..50) for first 50 columns. None for all.
188190 pub fn display ( & self , row_range : Option < Range < usize > > , col_range : Option < Range < usize > > ) -> io:: Result < ( ) > {
189191 // Fallback to full range if None is provided
190- let rows = row_range. unwrap_or ( 0 ..self . num_rows ) ;
191- let cols = col_range. unwrap_or ( 0 ..self . num_cols ) ;
192+ let rows = row_range. unwrap_or ( 0 ..self . rows ) ;
193+ let cols = col_range. unwrap_or ( 0 ..self . cols ) ;
192194
193195 let stdout = io:: stdout ( ) ;
194196 let mut handle = BufWriter :: new ( stdout. lock ( ) ) ;
@@ -200,14 +202,14 @@ impl<Kind> Alignment<Kind> {
200202 writeln ! ( handle, ">{}" , id) ?;
201203
202204 // MATH: Find the start of the row, then offset by the column start
203- let row_start_in_buffer = r_idx * self . num_cols ;
205+ let row_start_in_buffer = r_idx * self . cols ;
204206 let start = row_start_in_buffer + cols. start ;
205207 let end = row_start_in_buffer + cols. end ;
206208
207209 // Ensure we don't slice past the end of the actual row
208- let safe_end = end. min ( row_start_in_buffer + self . num_cols ) ;
210+ let safe_end = end. min ( row_start_in_buffer + self . cols ) ;
209211
210- if let Some ( row_slice) = self . data . get ( start..safe_end) {
212+ if let Some ( row_slice) = self . sequence_data . get ( start..safe_end) {
211213 handle. write_all ( row_slice) ?;
212214 handle. write_all ( b"\n " ) ?;
213215 }
@@ -223,8 +225,8 @@ impl<Kind> Alignment<Kind> {
223225 col_range : Option < Range < usize > > ,
224226 block_width : usize // Usually 60 or 80
225227 ) -> io:: Result < ( ) > {
226- let rows = row_range. unwrap_or ( 0 ..self . num_rows ) ;
227- let cols = col_range. unwrap_or ( 0 ..self . num_cols ) ;
228+ let rows = row_range. unwrap_or ( 0 ..self . rows ) ;
229+ let cols = col_range. unwrap_or ( 0 ..self . cols ) ;
228230
229231 let stdout = io:: stdout ( ) ;
230232 let mut handle = BufWriter :: new ( stdout. lock ( ) ) ;
@@ -243,11 +245,11 @@ impl<Kind> Alignment<Kind> {
243245 write ! ( handle, "{:<width$} " , display_id, width=max_id_width) ?;
244246
245247 //slice the specific chunk for this row
246- let row_offset = r_idx * self . num_cols ;
248+ let row_offset = r_idx * self . cols ;
247249 let start = row_offset + block_start;
248250 let end = row_offset + block_end;
249251
250- if let Some ( slice) = self . data . get ( start..end) {
252+ if let Some ( slice) = self . sequence_data . get ( start..end) {
251253 handle. write_all ( slice) ?;
252254 writeln ! ( handle) ?;
253255 }
@@ -263,22 +265,22 @@ impl<Kind> Alignment<Kind> {
263265
264266 //to subset the alignment if that is needed - by row and column
265267 pub fn subset ( & self , rows : Range < usize > , cols : Range < usize > ) -> Self {
266- let new_num_rows = rows. len ( ) ;
267- let new_num_cols = cols. len ( ) ;
268- let mut new_data = Vec :: with_capacity ( new_num_rows * new_num_cols ) ;
269- let mut new_ids = Vec :: with_capacity ( new_num_rows ) ;
268+ let new_rows = rows. len ( ) ;
269+ let new_cols = cols. len ( ) ;
270+ let mut new_data = Vec :: with_capacity ( new_rows * new_cols ) ;
271+ let mut new_ids = Vec :: with_capacity ( new_rows ) ;
270272
271273 for r in rows {
272274 // 1. Grab the ID
273275 new_ids. push ( self . ids [ r] . clone ( ) ) ;
274276
275277 // 2. Grab the specific column slice for this row
276- let start = ( r * self . num_cols ) + cols. start ;
277- let end = ( r * self . num_cols ) + cols. end ;
278- new_data. extend_from_slice ( & self . data [ start..end] ) ;
278+ let start = ( r * self . cols ) + cols. start ;
279+ let end = ( r * self . cols ) + cols. end ;
280+ new_data. extend_from_slice ( & self . sequence_data [ start..end] ) ;
279281 }
280282
281- Self :: new ( new_data, new_num_rows , new_ids)
283+ Self :: new ( new_data, new_rows , new_ids)
282284 }
283285 //to write part of the alignment to file
284286 pub fn write_fasta_part < P : AsRef < Path > > (
@@ -288,18 +290,18 @@ impl<Kind> Alignment<Kind> {
288290 col_range : Option < Range < usize > > ,
289291 wrap : Option < usize >
290292 ) -> anyhow:: Result < ( ) > {
291- let rows = row_range. unwrap_or ( 0 ..self . num_rows ) ;
292- let cols = col_range. unwrap_or ( 0 ..self . num_cols ) ;
293+ let rows = row_range. unwrap_or ( 0 ..self . rows ) ;
294+ let cols = col_range. unwrap_or ( 0 ..self . cols ) ;
293295
294296 let file = File :: create ( path) ?;
295297 let mut writer = BufWriter :: new ( file) ;
296298
297299 for r in rows {
298300 writeln ! ( writer, ">{}" , self . ids[ r] ) ?;
299301
300- let start = ( r * self . num_cols ) + cols. start ;
301- let end = ( r * self . num_cols ) + cols. end ;
302- let row_chunk = & self . data [ start..end] ;
302+ let start = ( r * self . cols ) + cols. start ;
303+ let end = ( r * self . cols ) + cols. end ;
304+ let row_chunk = & self . sequence_data [ start..end] ;
303305
304306 if let Some ( w) = wrap {
305307 for chunk in row_chunk. chunks ( w) {
@@ -320,13 +322,13 @@ impl<Kind> Alignment<Kind> {
320322 let file = File :: create ( path) ?;
321323 let mut writer = BufWriter :: new ( file) ;
322324
323- for r in 0 ..self . num_rows {
325+ for r in 0 ..self . rows {
324326 // Write the Header line
325327 writeln ! ( writer, ">{}" , self . ids[ r] ) ?;
326328
327- let start = r * self . num_cols ;
328- let end = start + self . num_cols ;
329- let row_data = & self . data [ start..end] ;
329+ let start = r * self . cols ;
330+ let end = start + self . cols ;
331+ let row_data = & self . sequence_data [ start..end] ;
330332
331333 match wrap {
332334 Some ( width) => {
@@ -353,13 +355,13 @@ impl Alignment<DNA> {
353355 //calculate GC-content of a specific column (homologous site)
354356 pub fn gc_content_at_col ( & self , col_idx : usize ) -> f32 {
355357 let mut gc_count = 0 ;
356- for r in 0 ..self . num_rows {
357- let base = self . data [ r * self . num_cols + col_idx] . to_ascii_uppercase ( ) ;
358+ for r in 0 ..self . rows {
359+ let base = self . sequence_data [ r * self . cols + col_idx] . to_ascii_uppercase ( ) ;
358360 if base == b'G' || base == b'C' {
359361 gc_count += 1 ;
360362 }
361363 }
362- gc_count as f32 / self . num_rows as f32
364+ gc_count as f32 / self . rows as f32
363365 }
364366 //to determine the most frequent base at a column
365367 pub fn most_frequent_base ( & self , column : & [ u8 ] ) -> char {
@@ -378,15 +380,15 @@ impl Alignment<DNA> {
378380 self . iter_cols ( )
379381 . map ( |col| {
380382 // Find the most frequent base in this column
381- // Senior move: ignore gaps '-' during consensus calculation
383+ // ignore gaps '-' during consensus calculation
382384 self . most_frequent_base ( & col)
383385 } )
384386 . collect ( )
385387 }
386388 //to identify a degenerate site
387389 pub fn is_degenerate_site ( & self , col_idx : usize , threshold : f32 ) -> bool {
388390 let counts = self . column_counts ( col_idx) ;
389- let max_freq = * counts. values ( ) . max ( ) . unwrap_or ( & 0 ) as f32 / self . num_rows as f32 ;
391+ let max_freq = * counts. values ( ) . max ( ) . unwrap_or ( & 0 ) as f32 / self . rows as f32 ;
390392 max_freq < threshold
391393 }
392394}
@@ -433,15 +435,15 @@ impl<'a, Kind> Iterator for ColumnIterator<'a, Kind> {
433435 type Item = Vec < u8 > ; // The residues at this homologous site
434436
435437 fn next ( & mut self ) -> Option < Self :: Item > {
436- if self . current_col >= self . alignment . num_cols {
438+ if self . current_col >= self . alignment . cols {
437439 return None ;
438440 }
439441
440442 // Collect residues from each row for the current column
441- let column: Vec < u8 > = ( 0 ..self . alignment . num_rows )
443+ let column: Vec < u8 > = ( 0 ..self . alignment . rows )
442444 . map ( |r| {
443- let idx = ( r * self . alignment . num_cols ) + self . current_col ;
444- self . alignment . data [ idx]
445+ let idx = ( r * self . alignment . cols ) + self . current_col ;
446+ self . alignment . sequence_data [ idx]
445447 } )
446448 . collect ( ) ;
447449
@@ -576,19 +578,25 @@ mod tests {
576578 #[ test]
577579 fn test_column_removal ( ) {
578580 let mut msa = setup_test_dna_msa ( ) ;
579- let original_cols = msa. num_cols ;
581+ let original_cols = msa. cols ;
580582
581583 // Remove columns 1 and 2 (indices 1..3)
582584 msa. remove_columns ( 1 , 3 ) ;
583585
584- assert_eq ! ( msa. num_cols , original_cols - 2 ) ;
586+ assert_eq ! ( msa. cols , original_cols - 2 ) ;
585587
586588 // row 0 was: A [T G] C - - A T G C A T G C A T G C
587589 // should be: A C - - A T G C A T G C A T G C
588590 let result = msa. get_row_by_id ( "seq1" ) . unwrap ( ) ;
589591 assert_eq ! ( result, b"AC--ATGCATGCATGC" ) ;
590592 }
591-
593+ #[ test]
594+ fn test_consensus_sequence ( ) {
595+ let msa = setup_test_dna_msa ( ) ;
596+ let consensus = msa. consensus_sequence ( ) ;
597+ assert_eq ! ( consensus, "ATGC-AATGCATGCATGC" . to_string( ) ) ;
598+ }
599+
592600 #[ test]
593601 fn test_protein_column_entropy ( ) {
594602 // Column 0: Perfectly conserved (M, M) -> Entropy 0
0 commit comments