
Commit 577286c

renaming fields for clarity and removing warnings on test
1 parent 5051b64 commit 577286c

File tree

5 files changed: +131 −68 lines


microBioRust/src/align.rs

Lines changed: 74 additions & 66 deletions
@@ -67,59 +67,62 @@ use std::ops::Range;
 use anyhow::{anyhow, Context};


+#[derive(PartialEq, Debug)]
 pub struct DNA;
+#[derive(PartialEq, Debug)]
 pub struct Protein;

+#[derive(PartialEq, Debug)]
 pub struct Alignment<Kind> {
-    data: Vec<u8>,
-    num_rows: usize,
-    num_cols: usize,
-    ids: Vec<String>,
-    id_to_index: HashMap<String, usize>,
-    _marker: std::marker::PhantomData<Kind>,
+    pub sequence_data: Vec<u8>,
+    pub rows: usize,
+    pub cols: usize,
+    pub ids: Vec<String>,
+    pub id_to_index: HashMap<String, usize>,
+    pub _marker: std::marker::PhantomData<Kind>,
 }

 impl<Kind> Alignment<Kind> {
-    pub fn new(data: Vec<u8>, num_rows: usize, ids: Vec<String>) -> Self {
-        let num_cols = data.len() / num_rows;
+    pub fn new(sequence_data: Vec<u8>, rows: usize, ids: Vec<String>) -> Self {
+        let cols = sequence_data.len() / rows;
         let id_to_index = ids.iter().enumerate().map(|(i, id)| (id.clone(), i)).collect();
-        Self { data, num_rows, num_cols, ids, id_to_index, _marker: std::marker::PhantomData }
+        Self { sequence_data, rows, cols, ids, id_to_index, _marker: std::marker::PhantomData }
     }
     //to access a row by ID
     pub fn get_row_by_id(&self, id: &str) -> Option<&[u8]> {
         let &idx = self.id_to_index.get(id)?;
-        let start = idx * self.num_cols;
-        Some(&self.data[start..start + self.num_cols])
+        let start = idx * self.cols;
+        Some(&self.sequence_data[start..start + self.cols])
     }
     //to delete a column by start and end
     pub fn remove_columns(&mut self, start: usize, end: usize) {
         let cols_to_remove = end - start;
-        let mut new_data = Vec::with_capacity(self.data.len() - (self.num_rows * cols_to_remove));
+        let mut new_data = Vec::with_capacity(self.sequence_data.len() - (self.rows * cols_to_remove));

-        for row in self.data.chunks_exact(self.num_cols) {
+        for row in self.sequence_data.chunks_exact(self.cols) {
             //keeping everything before the slice and everything after
             new_data.extend_from_slice(&row[..start]);
             new_data.extend_from_slice(&row[end..]);
         }

-        self.data = new_data;
-        self.num_cols -= cols_to_remove;
+        self.sequence_data = new_data;
+        self.cols -= cols_to_remove;
     }
     //to delete a row by indexed number
     pub fn remove_row_by_number(&mut self, row_idx: usize) {
-        if row_idx >= self.num_rows { return; }
+        if row_idx >= self.rows { return; }

-        let start = row_idx * self.num_cols;
-        let end = start + self.num_cols;
+        let start = row_idx * self.cols;
+        let end = start + self.cols;

         // Remove from the data vector
-        self.data.drain(start..end);
+        self.sequence_data.drain(start..end);

         // Update the ID map and row count
         let id_to_remove = self.ids[row_idx].clone();
         self.id_to_index.remove(&id_to_remove);
         self.ids.remove(row_idx);
-        self.num_rows -= 1;
+        self.rows -= 1;

         // Re-index the map (crucial for maintaining O(1) lookup)
         self.reindex_map();
@@ -140,37 +143,36 @@ impl<Kind> Alignment<Kind> {
     pub fn column_counts(&self, col_idx: usize) -> HashMap<u8, usize> {
         let mut counts = HashMap::new();

-        for r in 0..self.num_rows {
-            let residue = self.data[r * self.num_cols + col_idx];
-            // The .entry() API is the "Senior" way to do this in Rust
+        for r in 0..self.rows {
+            let residue = self.sequence_data[r * self.cols + col_idx];
             *counts.entry(residue).or_insert(0) += 1;
         }

         counts
     }
     //to get a list vector of counters for every column in the alignment
     pub fn all_column_counts(&self) -> Vec<HashMap<u8, usize>> {
-        (0..self.num_cols)
+        (0..self.cols)
             .map(|c| self.column_counts(c))
             .collect()
     }
     //to check for gaps - Returns true if the proportion of gaps '-' in a column exceeds the provided cutoff (0.0 to 1.0)
     pub fn is_gap_heavy(&self, col_idx: usize, cutoff: f32) -> bool {
         let mut gap_count = 0;

-        for r in 0..self.num_rows {
-            if self.data[r * self.num_cols + col_idx] == b'-' {
+        for r in 0..self.rows {
+            if self.sequence_data[r * self.cols + col_idx] == b'-' {
                 gap_count += 1;
             }
         }

-        let gap_proportion = gap_count as f32 / self.num_rows as f32;
+        let gap_proportion = gap_count as f32 / self.rows as f32;
         gap_proportion > cutoff
     }
     //to identify columns that exceed the provided gap threshold
     //returns a list of indices to be removed
     pub fn identify_gappy_columns(&self, cutoff: f32) -> Vec<usize> {
-        (0..self.num_cols)
+        (0..self.cols)
             .filter(|&c| self.is_gap_heavy(c, cutoff))
             .collect()
     }
@@ -187,8 +189,8 @@ impl<Kind> Alignment<Kind> {
     //col_range: e.g., Some(0..50) for first 50 columns. None for all.
     pub fn display(&self, row_range: Option<Range<usize>>, col_range: Option<Range<usize>>) -> io::Result<()> {
         // Fallback to full range if None is provided
-        let rows = row_range.unwrap_or(0..self.num_rows);
-        let cols = col_range.unwrap_or(0..self.num_cols);
+        let rows = row_range.unwrap_or(0..self.rows);
+        let cols = col_range.unwrap_or(0..self.cols);

         let stdout = io::stdout();
         let mut handle = BufWriter::new(stdout.lock());
@@ -200,14 +202,14 @@ impl<Kind> Alignment<Kind> {
             writeln!(handle, ">{}", id)?;

             // MATH: Find the start of the row, then offset by the column start
-            let row_start_in_buffer = r_idx * self.num_cols;
+            let row_start_in_buffer = r_idx * self.cols;
             let start = row_start_in_buffer + cols.start;
             let end = row_start_in_buffer + cols.end;

             // Ensure we don't slice past the end of the actual row
-            let safe_end = end.min(row_start_in_buffer + self.num_cols);
+            let safe_end = end.min(row_start_in_buffer + self.cols);

-            if let Some(row_slice) = self.data.get(start..safe_end) {
+            if let Some(row_slice) = self.sequence_data.get(start..safe_end) {
                 handle.write_all(row_slice)?;
                 handle.write_all(b"\n")?;
             }
@@ -223,8 +225,8 @@ impl<Kind> Alignment<Kind> {
         col_range: Option<Range<usize>>,
         block_width: usize // Usually 60 or 80
     ) -> io::Result<()> {
-        let rows = row_range.unwrap_or(0..self.num_rows);
-        let cols = col_range.unwrap_or(0..self.num_cols);
+        let rows = row_range.unwrap_or(0..self.rows);
+        let cols = col_range.unwrap_or(0..self.cols);

         let stdout = io::stdout();
         let mut handle = BufWriter::new(stdout.lock());
@@ -243,11 +245,11 @@ impl<Kind> Alignment<Kind> {
             write!(handle, "{:<width$} ", display_id, width=max_id_width)?;

             //slice the specific chunk for this row
-            let row_offset = r_idx * self.num_cols;
+            let row_offset = r_idx * self.cols;
             let start = row_offset + block_start;
             let end = row_offset + block_end;

-            if let Some(slice) = self.data.get(start..end) {
+            if let Some(slice) = self.sequence_data.get(start..end) {
                 handle.write_all(slice)?;
                 writeln!(handle)?;
             }
@@ -263,22 +265,22 @@ impl<Kind> Alignment<Kind> {

     //to subset the alignment if that is needed - by row and column
     pub fn subset(&self, rows: Range<usize>, cols: Range<usize>) -> Self {
-        let new_num_rows = rows.len();
-        let new_num_cols = cols.len();
-        let mut new_data = Vec::with_capacity(new_num_rows * new_num_cols);
-        let mut new_ids = Vec::with_capacity(new_num_rows);
+        let new_rows = rows.len();
+        let new_cols = cols.len();
+        let mut new_data = Vec::with_capacity(new_rows * new_cols);
+        let mut new_ids = Vec::with_capacity(new_rows);

         for r in rows {
             // 1. Grab the ID
             new_ids.push(self.ids[r].clone());

             // 2. Grab the specific column slice for this row
-            let start = (r * self.num_cols) + cols.start;
-            let end = (r * self.num_cols) + cols.end;
-            new_data.extend_from_slice(&self.data[start..end]);
+            let start = (r * self.cols) + cols.start;
+            let end = (r * self.cols) + cols.end;
+            new_data.extend_from_slice(&self.sequence_data[start..end]);
         }

-        Self::new(new_data, new_num_rows, new_ids)
+        Self::new(new_data, new_rows, new_ids)
     }
     //to write part of the alignment to file
     pub fn write_fasta_part<P: AsRef<Path>>(
@@ -288,18 +290,18 @@ impl<Kind> Alignment<Kind> {
         col_range: Option<Range<usize>>,
         wrap: Option<usize>
     ) -> anyhow::Result<()> {
-        let rows = row_range.unwrap_or(0..self.num_rows);
-        let cols = col_range.unwrap_or(0..self.num_cols);
+        let rows = row_range.unwrap_or(0..self.rows);
+        let cols = col_range.unwrap_or(0..self.cols);

         let file = File::create(path)?;
         let mut writer = BufWriter::new(file);

         for r in rows {
             writeln!(writer, ">{}", self.ids[r])?;

-            let start = (r * self.num_cols) + cols.start;
-            let end = (r * self.num_cols) + cols.end;
-            let row_chunk = &self.data[start..end];
+            let start = (r * self.cols) + cols.start;
+            let end = (r * self.cols) + cols.end;
+            let row_chunk = &self.sequence_data[start..end];

             if let Some(w) = wrap {
                 for chunk in row_chunk.chunks(w) {
@@ -320,13 +322,13 @@ impl<Kind> Alignment<Kind> {
         let file = File::create(path)?;
         let mut writer = BufWriter::new(file);

-        for r in 0..self.num_rows {
+        for r in 0..self.rows {
             // Write the Header line
             writeln!(writer, ">{}", self.ids[r])?;

-            let start = r * self.num_cols;
-            let end = start + self.num_cols;
-            let row_data = &self.data[start..end];
+            let start = r * self.cols;
+            let end = start + self.cols;
+            let row_data = &self.sequence_data[start..end];

             match wrap {
                 Some(width) => {
@@ -353,13 +355,13 @@ impl Alignment<DNA> {
     //calculate GC-content of a specific column (homologous site)
     pub fn gc_content_at_col(&self, col_idx: usize) -> f32 {
         let mut gc_count = 0;
-        for r in 0..self.num_rows {
-            let base = self.data[r * self.num_cols + col_idx].to_ascii_uppercase();
+        for r in 0..self.rows {
+            let base = self.sequence_data[r * self.cols + col_idx].to_ascii_uppercase();
             if base == b'G' || base == b'C' {
                 gc_count += 1;
             }
         }
-        gc_count as f32 / self.num_rows as f32
+        gc_count as f32 / self.rows as f32
     }
     //to determine the most frequent base at a column
     pub fn most_frequent_base(&self, column: &[u8]) -> char {
@@ -378,15 +380,15 @@ impl Alignment<DNA> {
         self.iter_cols()
             .map(|col| {
                 // Find the most frequent base in this column
-                // Senior move: ignore gaps '-' during consensus calculation
+                // ignore gaps '-' during consensus calculation
                 self.most_frequent_base(&col)
             })
             .collect()
     }
     //to identify a degenerate site
     pub fn is_degenerate_site(&self, col_idx: usize, threshold: f32) -> bool {
         let counts = self.column_counts(col_idx);
-        let max_freq = *counts.values().max().unwrap_or(&0) as f32 / self.num_rows as f32;
+        let max_freq = *counts.values().max().unwrap_or(&0) as f32 / self.rows as f32;
         max_freq < threshold
     }
 }
@@ -433,15 +435,15 @@ impl<'a, Kind> Iterator for ColumnIterator<'a, Kind> {
     type Item = Vec<u8>; // The residues at this homologous site

     fn next(&mut self) -> Option<Self::Item> {
-        if self.current_col >= self.alignment.num_cols {
+        if self.current_col >= self.alignment.cols {
             return None;
         }

         // Collect residues from each row for the current column
-        let column: Vec<u8> = (0..self.alignment.num_rows)
+        let column: Vec<u8> = (0..self.alignment.rows)
             .map(|r| {
-                let idx = (r * self.alignment.num_cols) + self.current_col;
-                self.alignment.data[idx]
+                let idx = (r * self.alignment.cols) + self.current_col;
+                self.alignment.sequence_data[idx]
             })
             .collect();

@@ -576,19 +578,25 @@ mod tests {
     #[test]
     fn test_column_removal() {
         let mut msa = setup_test_dna_msa();
-        let original_cols = msa.num_cols;
+        let original_cols = msa.cols;

         // Remove columns 1 and 2 (indices 1..3)
         msa.remove_columns(1, 3);

-        assert_eq!(msa.num_cols, original_cols - 2);
+        assert_eq!(msa.cols, original_cols - 2);

         // row 0 was: A [T G] C - - A T G C A T G C A T G C
         // should be: A C - - A T G C A T G C A T G C
         let result = msa.get_row_by_id("seq1").unwrap();
         assert_eq!(result, b"AC--ATGCATGCATGC");
     }
-
+    #[test]
+    fn test_consensus_sequence() {
+        let msa = setup_test_dna_msa();
+        let consensus = msa.consensus_sequence();
+        assert_eq!(consensus, "ATGC-AATGCATGCATGC".to_string());
+    }
+
     #[test]
     fn test_protein_column_entropy() {
         // Column 0: Perfectly conserved (M, M) -> Entropy 0

microBioRust/src/blast.rs

Lines changed: 6 additions & 2 deletions
@@ -1,3 +1,7 @@
+#![allow(unused_imports)]
+#![allow(unused_mut)]
+#![allow(unused_variables)]
+#![allow(dead_code)]
 //! async streaming BLAST parsers for the (outfmt 6 and outfmt 5),
 //! gzip-transparent reader, option of --json streaming output
 //!
@@ -90,7 +94,7 @@
 //! }
 //!
 //!```
-
+#![allow(unused_imports)]
 use anyhow::{Context, Result};
 use async_compression::tokio::bufread::GzipDecoder as AsyncGzDecoder;
 use clap::Parser;
@@ -672,4 +676,4 @@ mod tests {
         assert_eq!(it.query_def.unwrap(), "My query");
         assert_eq!(it.hits.len(), 1);
     }
-}
+}
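
A note on the warning cleanup: the four `#![allow(...)]` inner attributes silence those lints for the whole of blast.rs. A narrower alternative, sketched below as a design trade-off rather than what this commit does, would scope the allowances to the test module where the unused items actually live:

// Hypothetical alternative: keep file-level lints active and relax them
// only inside the test module.
#[cfg(test)]
#[allow(unused_imports, unused_mut, unused_variables, dead_code)]
mod tests {
    use super::*;
    // ... test functions ...
}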

microBioRust/tests/blast_parser.rs

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
+#![allow(unused_imports)]
 use microBioRust::blast::{stream_outfmt6_to_json, AsyncBlastXmlIter};
 use std::io::Cursor;
 use tokio::io::BufReader;
