Skip to content

Commit beea057

Browse files
Merge pull request #64 from pangenome/lower_memory_usage_again
Lower memory usage during IMPG index creation
2 parents 337b89e + 54d7589 commit beea057

File tree

3 files changed

+15
-23
lines changed

3 files changed

+15
-23
lines changed

src/impg.rs

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
use crate::paf::{PafRecord, ParseErr, Strand};
1+
use crate::paf::{PartialPafRecord, ParseErr, Strand};
22
use crate::seqidx::SequenceIndex;
33
use coitrees::{BasicCOITree, Interval, IntervalTree};
44
use log::debug;
@@ -297,7 +297,7 @@ pub struct Impg {
297297

298298
impl Impg {
299299
pub fn from_multi_paf_records(
300-
records_by_file: &[(Vec<PafRecord>, String)],
300+
records_by_file: &[(Vec<PartialPafRecord>, String)],
301301
seq_index: SequenceIndex
302302
) -> Result<Self, ParseErr> {
303303
// Use par_iter to process the files in parallel and collect both pieces of information
@@ -1360,13 +1360,11 @@ mod tests {
13601360
let target_id = seq_index.get_or_insert_id("t1", Some(200));
13611361
let reader = BufReader::new(&paf_data[..]);
13621362
let expected_records = vec![
1363-
PafRecord {
1363+
PartialPafRecord {
13641364
query_id: query_id,
1365-
query_length: 100,
13661365
query_start: 10,
13671366
query_end: 20,
13681367
target_id: target_id,
1369-
target_length: 200,
13701368
target_start: 30,
13711369
target_end: 40,
13721370
strand_and_cigar_offset: 45, // Forward strand

src/main.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ use std::io::{self, BufReader, BufWriter};
1212
use std::num::NonZeroUsize;
1313
use std::collections::hash_map::DefaultHasher;
1414
use std::hash::{Hash, Hasher};
15-
use crate::paf::PafRecord;
15+
use crate::paf::PartialPafRecord;
1616
use rayon::prelude::*;
1717
use std::sync::atomic::{AtomicUsize, Ordering};
1818
use std::sync::{Arc, Mutex};
@@ -425,7 +425,7 @@ fn generate_multi_index(paf_files: &[String], num_threads: NonZeroUsize, custom_
425425
// Process PAF files in parallel using Rayon
426426
let records_by_file: Vec<_> = (0..paf_files.len())
427427
.into_par_iter()
428-
.map(|file_index| -> io::Result<(Vec<PafRecord>, String)> {
428+
.map(|file_index| -> io::Result<(Vec<PartialPafRecord>, String)> {
429429
let paf_file = &paf_files[file_index];
430430

431431
// Increment the counter and get the new value atomically

src/paf.rs

Lines changed: 10 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -4,13 +4,11 @@ use std::num::ParseIntError;
44
use crate::seqidx::SequenceIndex;
55

66
#[derive(Debug, PartialEq, Clone)]
7-
pub struct PafRecord {
7+
pub struct PartialPafRecord {
88
pub query_id: u32,
9-
pub query_length: usize,
109
pub query_start: usize,
1110
pub query_end: usize,
1211
pub target_id: u32,
13-
pub target_length: usize,
1412
pub target_start: usize,
1513
pub target_end: usize,
1614
pub strand_and_cigar_offset: u64, // Track strand and cigar offset
@@ -25,7 +23,7 @@ pub enum Strand {
2523
Reverse,
2624
}
2725

28-
impl PafRecord {
26+
impl PartialPafRecord {
2927
const STRAND_BIT: u64 = 0x8000000000000000; // Most significant bit for u64
3028

3129
pub fn strand(&self) -> Strand {
@@ -86,11 +84,9 @@ impl PafRecord {
8684
// Create the record and set strand
8785
let mut record = Self {
8886
query_id,
89-
query_length,
9087
query_start,
9188
query_end,
9289
target_id,
93-
target_length,
9490
target_start,
9591
target_end,
9692
strand_and_cigar_offset: cigar_offset,
@@ -113,12 +109,12 @@ pub enum ParseErr {
113109
InvalidFormat(String),
114110
}
115111

116-
pub fn parse_paf<R: BufRead>(reader: R, seq_index: &mut SequenceIndex) -> Result<Vec<PafRecord>, ParseErr> {
112+
pub fn parse_paf<R: BufRead>(reader: R, seq_index: &mut SequenceIndex) -> Result<Vec<PartialPafRecord>, ParseErr> {
117113
let mut bytes_read: u64 = 0;
118114
let mut records = Vec::new();
119115
for line_result in reader.lines() {
120116
let line = line_result.map_err(ParseErr::IoError)?;
121-
let record = PafRecord::parse(&line, bytes_read, seq_index)?;
117+
let record = PartialPafRecord::parse(&line, bytes_read, seq_index)?;
122118
records.push(record);
123119

124120
// Size of line plus newline
@@ -135,21 +131,19 @@ mod tests {
135131
fn test_parse_paf_valid() {
136132
let line = "seq1\t100\t0\t100\t+\tseq2\t100\t0\t100\t60\t100\t255";
137133
let mut seq_index = SequenceIndex::new();
138-
let record = PafRecord::parse(line, 0, &mut seq_index).unwrap();
134+
let record = PartialPafRecord::parse(line, 0, &mut seq_index).unwrap();
139135

140136
// IDs should be 0 and 1 as they're the first entries in the SequenceIndex
141137
let query_id = seq_index.get_id("seq1").unwrap();
142138
let target_id = seq_index.get_id("seq2").unwrap();
143-
139+
144140
assert_eq!(
145141
record,
146-
PafRecord {
142+
PartialPafRecord {
147143
query_id,
148-
query_length: 100,
149144
query_start: 0,
150145
query_end: 100,
151146
target_id,
152-
target_length: 100,
153147
target_start: 0,
154148
target_end: 100,
155149
// If no cigar, then the offset is just the length of the line and cigar_bytes=0
@@ -164,22 +158,22 @@ mod tests {
164158
fn test_parse_paf_valid_2() {
165159
let line = "seq1\t100\t0\t100\t+\tseq2\t100\t0\t100\t60\t100\t255\tcg:Z:10=";
166160
let mut seq_index = SequenceIndex::new();
167-
assert!(PafRecord::parse(line, 0, &mut seq_index).is_ok());
161+
assert!(PartialPafRecord::parse(line, 0, &mut seq_index).is_ok());
168162
}
169163

170164
#[test]
171165
fn test_parse_paf_invalid() {
172166
// it's got a character 'z' in the target start field
173167
let line = "seq1\t100\t0\t100\t+\tseq2\t100\tz\t100\t60\t100\t255\tcg:Z:10M";
174168
let mut seq_index = SequenceIndex::new();
175-
assert!(PafRecord::parse(line, 0, &mut seq_index).is_err());
169+
assert!(PartialPafRecord::parse(line, 0, &mut seq_index).is_err());
176170
}
177171

178172
#[test]
179173
fn test_parse_paf_cigar_invalid() {
180174
// it's got Q in the CIGAR string (note: this line also has 'z' in the target start field)
181175
let line = "seq1\t100\t0\t100\t+\tseq2\t100\tz\t100\t60\t100\t255\tcg:Z:10Q";
182176
let mut seq_index = SequenceIndex::new();
183-
assert!(PafRecord::parse(line, 0, &mut seq_index).is_err());
177+
assert!(PartialPafRecord::parse(line, 0, &mut seq_index).is_err());
184178
}
185179
}

0 commit comments

Comments
 (0)