Skip to content

Commit 9b36856

Browse files
Merge pull request #112 from pangenome/optional-gzi-indexes
Make GZI files optional when the input PAF file is bgzip-compressed
2 parents 50ac821 + 56ca22c commit 9b36856

File tree

4 files changed

+137
-89
lines changed

4 files changed

+137
-89
lines changed

README.md

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -76,19 +76,20 @@ impg query -p alignments.paf -r chr1:1000-2000 -x -m 3
7676
# Filter by minimum gap-compressed identity
7777
impg query -p alignments.paf -r chr1:1000-2000 --min-identity 0.9
7878

79-
# Output formats (auto/bed/bedpe/paf/gfa/maf)
79+
# Output formats (auto/bed/bedpe/paf/gfa/maf/fasta)
80+
impg query -p alignments.paf -r chr1:1000-2000 -o bed
8081
impg query -p alignments.paf -r chr1:1000-2000 -o bedpe
81-
impg query -p alignments.paf -b regions.bed -o paf
82+
impg query -p alignments.paf -b chr1:1000-2000 -o paf
8283

83-
# GFA/MAF/FASTA output requires sequence files (--sequence-files or --sequence-list)
84+
# gfa/maf/fasta output requires sequence files (--sequence-files or --sequence-list)
8485
impg query -p alignments.paf -r chr1:1000-2000 -o gfa --sequence-files ref.fa genomes.fa
8586
impg query -p alignments.paf -r chr1:1000-2000 -o maf --sequence-list fastas.txt
8687
impg query -p alignments.paf -r chr1:1000-2000 -o fasta --sequence-files *.fa
8788

8889
# Works with AGC archives too
8990
impg query -p alignments.paf -r chr1:1000-2000 -o gfa --sequence-files genomes.agc
9091

91-
# FASTA output with reverse complement for reverse strand sequences
92+
# fasta output with reverse complement for reverse strand sequences
9293
impg query -p alignments.paf -r chr1:1000-2000 -o fasta --sequence-files *.fa --reverse-complement
9394

9495
# Merge nearby regions (default: 0)
@@ -281,6 +282,13 @@ impg index -p alignments.paf -i custom.impg
281282
impg index --paf-list paf_files.txt
282283
```
283284

285+
**Note on compressed PAF files**: `impg` works directly with bgzip-compressed PAF files (`.paf.gz`, `.paf.bgz`). For large files, creating a GZI index can speed up initial index creation:
286+
287+
```bash
288+
bgzip -r alignments.paf.gz # Creates alignments.paf.gz.gzi (optional)
289+
```
290+
291+
If a `.gzi` file is present, `impg` will automatically use it for faster multithreaded decompression.
284292

285293
### Common options
286294

src/impg.rs

Lines changed: 15 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,6 @@ impl QueryMetadata {
113113
fn get_cigar_ops(
114114
&self,
115115
paf_files: &[String],
116-
paf_gzi_indices: &[Option<bgzf::gzi::Index>],
117116
) -> Vec<CigarOp> {
118117
// Allocate space for cigar
119118
let mut cigar_buffer = vec![0; self.cigar_bytes];
@@ -124,15 +123,13 @@ impl QueryMetadata {
124123

125124
// Get reader and seek start of cigar str
126125
if [".gz", ".bgz"].iter().any(|e| paf_file.ends_with(e)) {
127-
// Get the GZI index for the PAF file
128-
let paf_gzi_index = paf_gzi_indices.get(paf_file_index).and_then(Option::as_ref);
129-
126+
// For compressed files, use virtual position directly
130127
let mut reader = bgzf::io::Reader::new(File::open(paf_file).unwrap());
131-
reader
132-
.seek_by_uncompressed_position(paf_gzi_index.unwrap(), self.cigar_offset())
133-
.unwrap();
128+
let virtual_position = bgzf::VirtualPosition::from(self.cigar_offset());
129+
reader.seek(virtual_position).unwrap();
134130
reader.read_exact(&mut cigar_buffer).unwrap();
135131
} else {
132+
// For uncompressed files, use byte offset
136133
let mut reader = File::open(paf_file).unwrap();
137134
reader.seek(SeekFrom::Start(self.cigar_offset())).unwrap();
138135
reader.read_exact(&mut cigar_buffer).unwrap();
@@ -285,36 +282,21 @@ impl SortedRanges {
285282
pub struct Impg {
286283
pub trees: RwLock<TreeMap>,
287284
pub seq_index: SequenceIndex,
288-
paf_files: Vec<String>, // List of all PAF files
289-
paf_gzi_indices: Vec<Option<bgzf::gzi::Index>>, // Corresponding GZI indices
290-
pub forest_map: ForestMap, // Forest map for lazy loading
291-
index_file_path: String, // Path to the index file for lazy loading
285+
paf_files: Vec<String>, // List of all PAF files
286+
pub forest_map: ForestMap, // Forest map for lazy loading
287+
index_file_path: String, // Path to the index file for lazy loading
292288
}
293289

294290
impl Impg {
295291
pub fn from_multi_paf_records(
296292
records_by_file: &[(Vec<PartialPafRecord>, String)],
297293
seq_index: SequenceIndex,
298294
) -> Result<Self, ParseErr> {
299-
// Use par_iter to process the files in parallel and collect both pieces of information
300-
let (paf_files, paf_gzi_indices): (Vec<String>, Vec<Option<bgzf::gzi::Index>>) =
301-
records_by_file
302-
.par_iter()
303-
.map(|(_, paf_file)| {
304-
let paf_gzi_index = if [".gz", ".bgz"].iter().any(|e| paf_file.ends_with(e)) {
305-
let paf_gzi_file = paf_file.to_owned() + ".gzi";
306-
Some(
307-
bgzf::gzi::fs::read(paf_gzi_file.clone())
308-
.unwrap_or_else(|_| panic!("Could not open {paf_gzi_file}")),
309-
)
310-
} else {
311-
None
312-
};
313-
314-
// Return both values as a tuple
315-
(paf_file.clone(), paf_gzi_index)
316-
})
317-
.unzip(); // Separate the tuples into two vectors
295+
// Extract just the PAF file paths
296+
let paf_files: Vec<String> = records_by_file
297+
.par_iter()
298+
.map(|(_, paf_file)| paf_file.clone())
299+
.collect();
318300

319301
let intervals: FxHashMap<u32, Vec<Interval<QueryMetadata>>> = records_by_file
320302
.par_iter()
@@ -373,7 +355,6 @@ impl Impg {
373355
trees: RwLock::new(trees),
374356
seq_index,
375357
paf_files,
376-
paf_gzi_indices,
377358
forest_map: ForestMap::new(), // All trees are in memory, no need for forest map
378359
index_file_path: String::new(), // All trees are in memory, no need for index file path
379360
})
@@ -556,27 +537,10 @@ impl Impg {
556537
)
557538
})?;
558539

559-
// Determine PAF GZI indices
560-
let paf_gzi_indices = paf_files
561-
.iter()
562-
.map(|paf_file| {
563-
if [".gz", ".bgz"].iter().any(|e| paf_file.ends_with(e)) {
564-
let paf_gzi_file = format!("{paf_file}.gzi");
565-
Some(
566-
bgzf::gzi::fs::read(paf_gzi_file.clone())
567-
.unwrap_or_else(|_| panic!("Could not open {paf_gzi_file}")),
568-
)
569-
} else {
570-
None
571-
}
572-
})
573-
.collect::<Vec<_>>();
574-
575540
Ok(Self {
576541
trees: RwLock::new(FxHashMap::default()), // Start with empty trees - load on demand
577542
seq_index,
578543
paf_files: paf_files.to_vec(),
579-
paf_gzi_indices,
580544
forest_map,
581545
index_file_path,
582546
})
@@ -631,7 +595,7 @@ impl Impg {
631595
metadata.query_end,
632596
metadata.strand(),
633597
),
634-
&metadata.get_cigar_ops(&self.paf_files, self.paf_gzi_indices.as_ref()),
598+
&metadata.get_cigar_ops(&self.paf_files),
635599
);
636600
if let Some((
637601
adjusted_query_start,
@@ -777,7 +741,7 @@ impl Impg {
777741
metadata.query_end,
778742
metadata.strand(),
779743
),
780-
&metadata.get_cigar_ops(&self.paf_files, self.paf_gzi_indices.as_ref()),
744+
&metadata.get_cigar_ops(&self.paf_files),
781745
);
782746

783747
if let Some((
@@ -1010,10 +974,7 @@ impl Impg {
1010974
metadata.query_end,
1011975
metadata.strand(),
1012976
),
1013-
&metadata.get_cigar_ops(
1014-
&self.paf_files,
1015-
self.paf_gzi_indices.as_ref(),
1016-
),
977+
&metadata.get_cigar_ops(&self.paf_files),
1017978
);
1018979

1019980
if let Some((

src/main.rs

Lines changed: 43 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1496,22 +1496,6 @@ fn generate_multi_index(
14961496
let index_file = get_combined_index_filename(paf_files, custom_index);
14971497
info!("No index found at {index_file}. Creating it now.");
14981498

1499-
// Check for missing .gzi files before processing
1500-
for paf_file in paf_files {
1501-
if [".gz", ".bgz"].iter().any(|e| paf_file.ends_with(e)) {
1502-
let gzi_file = format!("{paf_file}.gzi");
1503-
if !std::path::Path::new(&gzi_file).exists() {
1504-
return Err(io::Error::new(
1505-
io::ErrorKind::NotFound,
1506-
format!(
1507-
"Compressed PAF file '{paf_file}' requires a .gzi index file. \
1508-
Please create it using 'bgzip -r {paf_file}' or decompress the file first."
1509-
),
1510-
));
1511-
}
1512-
}
1513-
}
1514-
15151499
let num_paf_files = paf_files.len();
15161500
// Thread-safe counter for tracking progress
15171501
let files_processed = AtomicUsize::new(0);
@@ -1532,24 +1516,52 @@ fn generate_multi_index(
15321516
debug!("Processing PAF file ({current_count}/{num_paf_files}): {paf_file}");
15331517

15341518
let file = File::open(paf_file)?;
1535-
let reader: Box<dyn io::Read> =
1536-
if [".gz", ".bgz"].iter().any(|e| paf_file.ends_with(e)) {
1537-
Box::new(bgzf::io::MultithreadedReader::with_worker_count(
1538-
threads, file,
1539-
))
1540-
} else {
1541-
Box::new(file)
1542-
};
1543-
let reader = BufReader::new(reader);
15441519

15451520
// Lock, get IDs, build records
15461521
let mut seq_index_guard = tmp_seq_index.lock().unwrap();
1547-
let records = impg::paf::parse_paf(reader, &mut seq_index_guard).map_err(|e| {
1548-
io::Error::new(
1549-
io::ErrorKind::InvalidData,
1550-
format!("Failed to parse PAF records from {paf_file}: {e:?}"),
1551-
)
1552-
})?;
1522+
1523+
// Use different parsing logic for compressed vs uncompressed files
1524+
let records = if [".gz", ".bgz"].iter().any(|e| paf_file.ends_with(e)) {
1525+
// For compressed files, check if GZI index exists for optimization
1526+
let gzi_path = format!("{}.gzi", paf_file);
1527+
if std::path::Path::new(&gzi_path).exists() {
1528+
debug!("Found GZI index for {}, using multithreaded decompression", paf_file);
1529+
// Use multithreaded reader with GZI for faster parsing
1530+
let gzi_index = bgzf::gzi::fs::read(&gzi_path).map_err(|e| {
1531+
io::Error::new(
1532+
io::ErrorKind::InvalidData,
1533+
format!("Failed to read GZI index {}: {}", gzi_path, e),
1534+
)
1535+
})?;
1536+
let mt_reader = bgzf::io::MultithreadedReader::with_worker_count(threads, file);
1537+
impg::paf::parse_paf_bgzf_with_gzi(mt_reader, gzi_index, &mut seq_index_guard)
1538+
.map_err(|e| {
1539+
io::Error::new(
1540+
io::ErrorKind::InvalidData,
1541+
format!("Failed to parse PAF records from {}: {:?}", paf_file, e),
1542+
)
1543+
})?
1544+
} else {
1545+
debug!("No GZI index found for {}, using BGZF reader", paf_file);
1546+
// No GZI available, use BGZF reader to capture virtual positions directly
1547+
let bgzf_reader = bgzf::io::Reader::new(file);
1548+
impg::paf::parse_paf_bgzf(bgzf_reader, &mut seq_index_guard).map_err(|e| {
1549+
io::Error::new(
1550+
io::ErrorKind::InvalidData,
1551+
format!("Failed to parse PAF records from {}: {:?}", paf_file, e),
1552+
)
1553+
})?
1554+
}
1555+
} else {
1556+
// For uncompressed files, use regular buffered reader
1557+
let reader = BufReader::new(file);
1558+
impg::paf::parse_paf(reader, &mut seq_index_guard).map_err(|e| {
1559+
io::Error::new(
1560+
io::ErrorKind::InvalidData,
1561+
format!("Failed to parse PAF records from {}: {:?}", paf_file, e),
1562+
)
1563+
})?
1564+
};
15531565

15541566
Ok((records, paf_file.clone()))
15551567
},

src/paf.rs

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,73 @@ pub fn parse_paf<R: BufRead>(
129129
Ok(records)
130130
}
131131

132+
/// Parse PAF from a BGZF-compressed file, storing virtual positions for seeking.
133+
/// If a GZI index is provided, uses it for faster multithreaded decompression and converts
134+
/// uncompressed offsets to virtual positions. Otherwise, reads with single-threaded BGZF reader.
135+
pub fn parse_paf_bgzf<R: std::io::Read + std::io::Seek>(
136+
mut reader: noodles::bgzf::io::Reader<R>,
137+
seq_index: &mut SequenceIndex,
138+
) -> Result<Vec<PartialPafRecord>, ParseErr> {
139+
use std::io::BufRead;
140+
141+
let mut records = Vec::new();
142+
let mut line_buf = String::new();
143+
144+
loop {
145+
// Get virtual position BEFORE reading the line
146+
let virtual_pos = reader.virtual_position();
147+
line_buf.clear();
148+
149+
let bytes_read = reader.read_line(&mut line_buf).map_err(ParseErr::IoError)?;
150+
if bytes_read == 0 {
151+
break; // EOF
152+
}
153+
154+
// Remove trailing newline
155+
let line = line_buf.trim_end();
156+
if line.is_empty() {
157+
continue;
158+
}
159+
160+
// Parse the record using the virtual position
161+
let record = PartialPafRecord::parse(line, virtual_pos.into(), seq_index)?;
162+
records.push(record);
163+
}
164+
165+
Ok(records)
166+
}
167+
168+
/// Parse PAF from a BGZF-compressed file using a GZI index for faster multithreaded decompression.
169+
/// After parsing with uncompressed offsets, converts them to virtual positions for seeking.
170+
pub fn parse_paf_bgzf_with_gzi<R: std::io::Read>(
171+
reader: R,
172+
gzi_index: noodles::bgzf::gzi::Index,
173+
seq_index: &mut SequenceIndex,
174+
) -> Result<Vec<PartialPafRecord>, ParseErr> {
175+
// First pass: parse with uncompressed byte offsets
176+
let reader = std::io::BufReader::new(reader);
177+
let mut records = parse_paf(reader, seq_index)?;
178+
179+
// Second pass: convert uncompressed offsets to virtual positions using GZI
180+
for record in &mut records {
181+
// Extract the uncompressed offset (ignoring the strand bit)
182+
let uncompressed_offset = record.strand_and_cigar_offset & !PartialPafRecord::STRAND_BIT;
183+
184+
// Convert to virtual position using GZI query
185+
let virtual_pos = gzi_index
186+
.query(uncompressed_offset)
187+
.map_err(|e| ParseErr::InvalidFormat(
188+
format!("Failed to find virtual position for offset {}: {:?}", uncompressed_offset, e)
189+
))?;
190+
191+
// Update the record with virtual position, preserving strand bit
192+
let strand_bit = record.strand_and_cigar_offset & PartialPafRecord::STRAND_BIT;
193+
record.strand_and_cigar_offset = u64::from(virtual_pos) | strand_bit;
194+
}
195+
196+
Ok(records)
197+
}
198+
132199
#[cfg(test)]
133200
mod tests {
134201
use super::*;

0 commit comments

Comments
 (0)