Merge pull request #112 from pangenome/optional-gzi-indexes

AndreaGuarracino · web-flow · commit 9b368562b17f · 2025-10-09T17:42:53.000+02:00
Make GZI files optional when the input PAF file is bgzip-compressed
diff --git a/README.md b/README.md
@@ -76,19 +76,20 @@ impg query -p alignments.paf -r chr1:1000-2000 -x -m 3
 # Filter by minimum gap-compressed identity
 impg query -p alignments.paf -r chr1:1000-2000 --min-identity 0.9
 
-# Output formats (auto/bed/bedpe/paf/gfa/maf)
+# Output formats (auto/bed/bedpe/paf/gfa/maf/fasta)
+impg query -p alignments.paf -r chr1:1000-2000 -o bed
 impg query -p alignments.paf -r chr1:1000-2000 -o bedpe
-impg query -p alignments.paf -b regions.bed -o paf
+impg query -p alignments.paf -r chr1:1000-2000 -o paf
 
-# GFA/MAF/FASTA output requires sequence files (--sequence-files or --sequence-list)
+# gfa/maf/fasta output requires sequence files (--sequence-files or --sequence-list)
 impg query -p alignments.paf -r chr1:1000-2000 -o gfa --sequence-files ref.fa genomes.fa
 impg query -p alignments.paf -r chr1:1000-2000 -o maf --sequence-list fastas.txt
 impg query -p alignments.paf -r chr1:1000-2000 -o fasta --sequence-files *.fa
 
 # Works with AGC archives too
 impg query -p alignments.paf -r chr1:1000-2000 -o gfa --sequence-files genomes.agc
 
-# FASTA output with reverse complement for reverse strand sequences
+# fasta output with reverse complement for reverse strand sequences
 impg query -p alignments.paf -r chr1:1000-2000 -o fasta --sequence-files *.fa --reverse-complement
 
 # Merge nearby regions (default: 0)
@@ -281,6 +282,13 @@ impg index -p alignments.paf -i custom.impg
 impg index --paf-list paf_files.txt
 ```
 
+**Note on compressed PAF files**: `impg` works directly with bgzip-compressed PAF files (`.paf.gz`, `.paf.bgz`). For large files, providing a GZI index alongside the PAF can speed up the initial creation of the `impg` index:
+
+```bash
+bgzip -r alignments.paf.gz  # Creates alignments.paf.gz.gzi (optional)
+```
+
+If a `.gzi` file is present, `impg` will automatically use it for faster multithreaded decompression.
 
 ### Common options
 
diff --git a/src/impg.rs b/src/impg.rs
@@ -113,7 +113,6 @@ impl QueryMetadata {
     fn get_cigar_ops(
         &self,
         paf_files: &[String],
-        paf_gzi_indices: &[Option<bgzf::gzi::Index>],
     ) -> Vec<CigarOp> {
         // Allocate space for cigar
         let mut cigar_buffer = vec![0; self.cigar_bytes];
@@ -124,15 +123,13 @@ impl QueryMetadata {
 
         // Get reader and seek start of cigar str
         if [".gz", ".bgz"].iter().any(|e| paf_file.ends_with(e)) {
-            // Get the GZI index for the PAF file
-            let paf_gzi_index = paf_gzi_indices.get(paf_file_index).and_then(Option::as_ref);
-
+            // For compressed files, use virtual position directly
             let mut reader = bgzf::io::Reader::new(File::open(paf_file).unwrap());
-            reader
-                .seek_by_uncompressed_position(paf_gzi_index.unwrap(), self.cigar_offset())
-                .unwrap();
+            let virtual_position = bgzf::VirtualPosition::from(self.cigar_offset());
+            reader.seek(virtual_position).unwrap();
             reader.read_exact(&mut cigar_buffer).unwrap();
         } else {
+            // For uncompressed files, use byte offset
             let mut reader = File::open(paf_file).unwrap();
             reader.seek(SeekFrom::Start(self.cigar_offset())).unwrap();
             reader.read_exact(&mut cigar_buffer).unwrap();
@@ -285,36 +282,21 @@ impl SortedRanges {
 pub struct Impg {
     pub trees: RwLock<TreeMap>,
     pub seq_index: SequenceIndex,
-    paf_files: Vec<String>,                         // List of all PAF files
-    paf_gzi_indices: Vec<Option<bgzf::gzi::Index>>, // Corresponding GZI indices
-    pub forest_map: ForestMap,                      // Forest map for lazy loading
-    index_file_path: String,                        // Path to the index file for lazy loading
+    paf_files: Vec<String>,        // List of all PAF files
+    pub forest_map: ForestMap,     // Forest map for lazy loading
+    index_file_path: String,       // Path to the index file for lazy loading
 }
 
 impl Impg {
     pub fn from_multi_paf_records(
         records_by_file: &[(Vec<PartialPafRecord>, String)],
         seq_index: SequenceIndex,
     ) -> Result<Self, ParseErr> {
-        // Use par_iter to process the files in parallel and collect both pieces of information
-        let (paf_files, paf_gzi_indices): (Vec<String>, Vec<Option<bgzf::gzi::Index>>) =
-            records_by_file
-                .par_iter()
-                .map(|(_, paf_file)| {
-                    let paf_gzi_index = if [".gz", ".bgz"].iter().any(|e| paf_file.ends_with(e)) {
-                        let paf_gzi_file = paf_file.to_owned() + ".gzi";
-                        Some(
-                            bgzf::gzi::fs::read(paf_gzi_file.clone())
-                                .unwrap_or_else(|_| panic!("Could not open {paf_gzi_file}")),
-                        )
-                    } else {
-                        None
-                    };
-
-                    // Return both values as a tuple
-                    (paf_file.clone(), paf_gzi_index)
-                })
-                .unzip(); // Separate the tuples into two vectors
+        // Extract just the PAF file paths
+        let paf_files: Vec<String> = records_by_file
+            .par_iter()
+            .map(|(_, paf_file)| paf_file.clone())
+            .collect();
 
         let intervals: FxHashMap<u32, Vec<Interval<QueryMetadata>>> = records_by_file
             .par_iter()
@@ -373,7 +355,6 @@ impl Impg {
             trees: RwLock::new(trees),
             seq_index,
             paf_files,
-            paf_gzi_indices,
             forest_map: ForestMap::new(), // All trees are in memory, no need for forest map
             index_file_path: String::new(), // All trees are in memory, no need for index file path
         })
@@ -556,27 +537,10 @@ impl Impg {
                     )
                 })?;
 
-        // Determine PAF GZI indices
-        let paf_gzi_indices = paf_files
-            .iter()
-            .map(|paf_file| {
-                if [".gz", ".bgz"].iter().any(|e| paf_file.ends_with(e)) {
-                    let paf_gzi_file = format!("{paf_file}.gzi");
-                    Some(
-                        bgzf::gzi::fs::read(paf_gzi_file.clone())
-                            .unwrap_or_else(|_| panic!("Could not open {paf_gzi_file}")),
-                    )
-                } else {
-                    None
-                }
-            })
-            .collect::<Vec<_>>();
-
         Ok(Self {
             trees: RwLock::new(FxHashMap::default()), // Start with empty trees - load on demand
             seq_index,
             paf_files: paf_files.to_vec(),
-            paf_gzi_indices,
             forest_map,
             index_file_path,
         })
@@ -631,7 +595,7 @@ impl Impg {
                         metadata.query_end,
                         metadata.strand(),
                     ),
-                    &metadata.get_cigar_ops(&self.paf_files, self.paf_gzi_indices.as_ref()),
+                    &metadata.get_cigar_ops(&self.paf_files),
                 );
                 if let Some((
                     adjusted_query_start,
@@ -777,7 +741,7 @@ impl Impg {
                                 metadata.query_end,
                                 metadata.strand(),
                             ),
-                            &metadata.get_cigar_ops(&self.paf_files, self.paf_gzi_indices.as_ref()),
+                            &metadata.get_cigar_ops(&self.paf_files),
                         );
 
                         if let Some((
@@ -1010,10 +974,7 @@ impl Impg {
                                                 metadata.query_end,
                                                 metadata.strand(),
                                             ),
-                                            &metadata.get_cigar_ops(
-                                                &self.paf_files,
-                                                self.paf_gzi_indices.as_ref(),
-                                            ),
+                                            &metadata.get_cigar_ops(&self.paf_files),
                                         );
 
                                         if let Some((
diff --git a/src/main.rs b/src/main.rs
@@ -1496,22 +1496,6 @@ fn generate_multi_index(
     let index_file = get_combined_index_filename(paf_files, custom_index);
     info!("No index found at {index_file}. Creating it now.");
 
-    // Check for missing .gzi files before processing
-    for paf_file in paf_files {
-        if [".gz", ".bgz"].iter().any(|e| paf_file.ends_with(e)) {
-            let gzi_file = format!("{paf_file}.gzi");
-            if !std::path::Path::new(&gzi_file).exists() {
-                return Err(io::Error::new(
-                    io::ErrorKind::NotFound,
-                    format!(
-                        "Compressed PAF file '{paf_file}' requires a .gzi index file. \
-                        Please create it using 'bgzip -r {paf_file}' or decompress the file first."
-                    ),
-                ));
-            }
-        }
-    }
-
     let num_paf_files = paf_files.len();
     // Thread-safe counter for tracking progress
     let files_processed = AtomicUsize::new(0);
@@ -1532,24 +1516,52 @@ fn generate_multi_index(
                 debug!("Processing PAF file ({current_count}/{num_paf_files}): {paf_file}");
 
                 let file = File::open(paf_file)?;
-                let reader: Box<dyn io::Read> =
-                    if [".gz", ".bgz"].iter().any(|e| paf_file.ends_with(e)) {
-                        Box::new(bgzf::io::MultithreadedReader::with_worker_count(
-                            threads, file,
-                        ))
-                    } else {
-                        Box::new(file)
-                    };
-                let reader = BufReader::new(reader);
 
                 // Lock, get IDs, build records
                 let mut seq_index_guard = tmp_seq_index.lock().unwrap();
-                let records = impg::paf::parse_paf(reader, &mut seq_index_guard).map_err(|e| {
-                    io::Error::new(
-                        io::ErrorKind::InvalidData,
-                        format!("Failed to parse PAF records from {paf_file}: {e:?}"),
-                    )
-                })?;
+
+                // Use different parsing logic for compressed vs uncompressed files
+                let records = if [".gz", ".bgz"].iter().any(|e| paf_file.ends_with(e)) {
+                    // For compressed files, check if GZI index exists for optimization
+                    let gzi_path = format!("{}.gzi", paf_file);
+                    if std::path::Path::new(&gzi_path).exists() {
+                        debug!("Found GZI index for {}, using multithreaded decompression", paf_file);
+                        // Use multithreaded reader with GZI for faster parsing
+                        let gzi_index = bgzf::gzi::fs::read(&gzi_path).map_err(|e| {
+                            io::Error::new(
+                                io::ErrorKind::InvalidData,
+                                format!("Failed to read GZI index {}: {}", gzi_path, e),
+                            )
+                        })?;
+                        let mt_reader = bgzf::io::MultithreadedReader::with_worker_count(threads, file);
+                        impg::paf::parse_paf_bgzf_with_gzi(mt_reader, gzi_index, &mut seq_index_guard)
+                            .map_err(|e| {
+                                io::Error::new(
+                                    io::ErrorKind::InvalidData,
+                                    format!("Failed to parse PAF records from {}: {:?}", paf_file, e),
+                                )
+                            })?
+                    } else {
+                        debug!("No GZI index found for {}, using BGZF reader", paf_file);
+                        // No GZI available, use BGZF reader to capture virtual positions directly
+                        let bgzf_reader = bgzf::io::Reader::new(file);
+                        impg::paf::parse_paf_bgzf(bgzf_reader, &mut seq_index_guard).map_err(|e| {
+                            io::Error::new(
+                                io::ErrorKind::InvalidData,
+                                format!("Failed to parse PAF records from {}: {:?}", paf_file, e),
+                            )
+                        })?
+                    }
+                } else {
+                    // For uncompressed files, use regular buffered reader
+                    let reader = BufReader::new(file);
+                    impg::paf::parse_paf(reader, &mut seq_index_guard).map_err(|e| {
+                        io::Error::new(
+                            io::ErrorKind::InvalidData,
+                            format!("Failed to parse PAF records from {}: {:?}", paf_file, e),
+                        )
+                    })?
+                };
 
                 Ok((records, paf_file.clone()))
             },
diff --git a/src/paf.rs b/src/paf.rs
@@ -129,6 +129,73 @@ pub fn parse_paf<R: BufRead>(
     Ok(records)
 }
 
+/// Parse PAF from a BGZF-compressed file, storing virtual positions for seeking.
+/// Reads line by line with the single-threaded BGZF reader and records the virtual position
+/// taken at the start of each line. For the GZI-assisted path, see `parse_paf_bgzf_with_gzi`.
+pub fn parse_paf_bgzf<R: std::io::Read + std::io::Seek>(
+    mut reader: noodles::bgzf::io::Reader<R>,
+    seq_index: &mut SequenceIndex,
+) -> Result<Vec<PartialPafRecord>, ParseErr> {
+    use std::io::BufRead;
+
+    let mut records = Vec::new();
+    let mut line_buf = String::new();
+
+    loop {
+        // Get virtual position BEFORE reading the line
+        let virtual_pos = reader.virtual_position();
+        line_buf.clear();
+
+        let bytes_read = reader.read_line(&mut line_buf).map_err(ParseErr::IoError)?;
+        if bytes_read == 0 {
+            break; // EOF
+        }
+
+        // Remove trailing newline
+        let line = line_buf.trim_end();
+        if line.is_empty() {
+            continue;
+        }
+
+        // Parse the record using the virtual position
+        let record = PartialPafRecord::parse(line, virtual_pos.into(), seq_index)?;
+        records.push(record);
+    }
+
+    Ok(records)
+}
+
+/// Parse PAF from a BGZF-compressed file using a GZI index for faster multithreaded decompression.
+/// After parsing with uncompressed offsets, converts them to virtual positions for seeking.
+pub fn parse_paf_bgzf_with_gzi<R: std::io::Read>(
+    reader: R,
+    gzi_index: noodles::bgzf::gzi::Index,
+    seq_index: &mut SequenceIndex,
+) -> Result<Vec<PartialPafRecord>, ParseErr> {
+    // First pass: parse with uncompressed byte offsets
+    let reader = std::io::BufReader::new(reader);
+    let mut records = parse_paf(reader, seq_index)?;
+
+    // Second pass: convert uncompressed offsets to virtual positions using GZI
+    for record in &mut records {
+        // Extract the uncompressed offset (ignoring the strand bit)
+        let uncompressed_offset = record.strand_and_cigar_offset & !PartialPafRecord::STRAND_BIT;
+
+        // Convert to virtual position using GZI query
+        let virtual_pos = gzi_index
+            .query(uncompressed_offset)
+            .map_err(|e| ParseErr::InvalidFormat(
+                format!("Failed to find virtual position for offset {}: {:?}", uncompressed_offset, e)
+            ))?;
+
+        // Update the record with virtual position, preserving strand bit
+        let strand_bit = record.strand_and_cigar_offset & PartialPafRecord::STRAND_BIT;
+        record.strand_and_cigar_offset = u64::from(virtual_pos) | strand_bit;
+    }
+
+    Ok(records)
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;