added NUMTs breakends caller (#22)

hangsuUNC · web-flow · commit aacd30d3b733 · 2026-01-30T11:15:05.000-05:00
* added NUMTs breakends caller

* added README call-numts

* added version
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "Himito"
-version = "0.1.0"
+version = "1.0.0"
 edition = "2021"
 
 [dependencies]
diff --git a/README.md b/README.md
@@ -63,3 +63,41 @@ msbwt2-build -o sr_msbwt.npy <srWGS.chrM.fasta.gz>
 # enumerate all possible haplotypes within windows
 ./target/release/Himito minorhap -g <output.gfa> -o <output.allhaplotype.fasta> -s <sample_id>
 ```
+
+### Jan 20th 2026 Updates: NUMTs Breakpoint Calling
+
+Himito can identify and call NUMT (nuclear mitochondrial DNA segment) breakpoints from long-read BAM files. NUMTs are segments of mitochondrial DNA that have been inserted into the nuclear genome. The NUMT calling functionality (`callnumts.rs`) works by:
+
+1. **Identifying NUMT reads**: Parses supplementary alignment (SA) tags in BAM files to identify reads that have:
+   - Primary alignment to mitochondrial DNA (chrM) and supplementary alignment to nuclear chromosomes, OR
+   - Primary alignment to nuclear chromosomes and supplementary alignment to mitochondrial DNA
+
+2. **Merging breakpoints**: Groups breakpoints by chromosome and merges consecutive breakpoints within a specified gap threshold (`max_gap_threshold`) to create intervals. This helps reduce noise and identify true NUMT insertion sites.
+
+3. **Writing BND records**: Outputs breakend (BND) structural variant records in VCF format, representing the NUMT breakpoints. Each BND record includes:
+   - Breakpoint positions on both the nuclear chromosome and mitochondrial genome
+   - Strand orientation information
+   - Supporting read counts (allele depth)
+   - Properly formatted BND ALT fields based on strand orientations
+
+**Usage:**
+```bash
+./target/release/Himito call-numts -i sample.bam \
+                               -c chrM \
+                               -m 10000 \
+                               -r hg38.fa \
+                               -o numts_breakpoints.vcf \
+                               -s HG002 \
+                               -a 2
+```
+**Output format:**
+The output VCF file contains BND (breakend) records in standard VCFv4.3 format. Each record represents a NUMT breakpoint with:
+- `CHROM`: Nuclear chromosome where the breakpoint occurs
+- `POS`: Breakpoint position (1-based)
+- `ALT`: BND format string indicating the connection to mitochondrial DNA
+- `INFO`: Contains `SVTYPE=BND`
+- `FORMAT`: GT (genotype) and AD (allele depth/supporting read count)
+
+The example command above identifies NUMT breakpoints from reads whose alignments span both the mitochondrial and nuclear genomes, merges breakpoints within 10 kb of each other, and reports only those with at least 2 supporting reads. To examine every candidate NUMT breakend, set `-a` to 1 or lower.
+
+TODO: consider local assembly to recover sequence-resolved NUMT insertions.
diff --git a/src/agg.rs b/src/agg.rs
@@ -418,7 +418,7 @@ pub fn construct_major_haplotype(graph:&GraphicalGenome) -> String {
     anchorlist.sort();
     let mut src = anchorlist.first().unwrap().to_string();
     let mut next_edge = "".to_string();
-    let mut dst = anchorlist.last().unwrap().to_string();
+    let dst = anchorlist.last().unwrap().to_string();
     let mut haplotype = String::new();
     // println!("{}", src);
     let mut entity_set = HashSet::new();
@@ -475,7 +475,7 @@ pub fn construct_major_haplotype_entitylist(graph:&GraphicalGenome) -> Vec<Strin
     anchorlist.sort();
     let mut src = anchorlist.first().unwrap().to_string();
     let mut next_edge = "".to_string();
-    let mut dst = anchorlist.last().unwrap().to_string();
+    let dst = anchorlist.last().unwrap().to_string();
     let mut haplotype = Vec::new();
     // println!("{}", src);
     let mut entity_set = HashSet::new();
diff --git a/src/asm.rs b/src/asm.rs
@@ -4,7 +4,7 @@ use std::{path::PathBuf, fs::File, io::{self, Write}};
 
 
 fn write_fasta(outputfile: &PathBuf, sequence: String, header: &str) -> io::Result<()>{
-    let mut index = 0;
+    let index = 0;
     let mut file = File::create(outputfile)?;
 
     let header = format!(">{} \n", header);
diff --git a/src/build.rs b/src/build.rs
@@ -8,7 +8,7 @@ use serde::{Deserialize, Serialize};
 use std::collections::HashMap;
 use std::error::Error;
 use std::fs::File;
-use std::io::{self, BufRead, BufReader, Write};
+use std::io::{self, BufRead, Write};
 
 
 pub fn reverse_complement(kmer: &str) -> String {
@@ -334,7 +334,7 @@ pub fn create_edge_file(
                 edgeindex += 1;
             }
             // debuging why edge sequence are empty
-            let mut edge_seq = if src_pos == 0 {
+            let edge_seq = if src_pos == 0 {
             let seq = contig.get(0..dst_pos).unwrap_or_default().to_string();
                 if seq.is_empty() {
                     println!("Warning: Empty edge sequence created for SOURCE. dst_pos: {}, contig_len: {}", dst_pos, contig.len());
diff --git a/src/callnumts.rs b/src/callnumts.rs
@@ -0,0 +1,241 @@
+use std::{collections::HashMap, path::PathBuf, io::Write};
+use rust_htslib::bam::{Read,IndexedReader, record::Aux};
+use bio::io::fasta::{Reader as FastaReader, Record as FastaRecord};
+
+
+
+/// Collects candidate NUMT junctions from records overlapping `chromo` in an indexed BAM.
+///
+/// Every mapped record fetched from `chromo` is inspected for a string-typed `SA` tag.
+/// Each SA entry located on a contig other than `chromo` yields one tuple:
+/// `(record_chrom, record_pos, record_strand, sa_chrom, sa_pos, sa_strand, sa_cigar)`.
+///
+/// Both positions are 0-based: `record_pos` comes from `Record::pos()`, and the
+/// 1-based SA position is shifted down by one so the two share a coordinate system.
+///
+/// # Errors
+/// Returns an error if the BAM or its index cannot be opened, if `chromo` is missing
+/// from the header, if a record cannot be read, if a record position does not fit in
+/// `i32`, or if an SA entry has a malformed position or strand field.
+pub fn find_numts(bam_file: &PathBuf, chromo: &str) -> Result<Vec<(String, i32, String, String, i32, String, String)>, Box<dyn std::error::Error>> {
+    let mut numts_mapping_info: Vec<(String, i32, String, String, i32, String, String)> = Vec::new();
+    let mut bam = IndexedReader::from_path(bam_file)?;
+
+    // Resolve the target contig and its length from the BAM header.
+    let tid = bam.header().tid(chromo.as_bytes())
+        .ok_or("Chromosome not found in BAM header")?;
+    let chrom_length = bam.header().target_len(tid)
+        .ok_or("Could not get chromosome length")?;
+
+    // Restrict iteration to records overlapping the whole of `chromo`.
+    bam.fetch((tid, 0, chrom_length))?;
+    let header = bam.header().clone();
+
+    for read in bam.records() {
+        let record = read?;
+        if record.is_unmapped() {
+            continue;
+        }
+        // Only records carrying a string-typed SA tag can describe a junction.
+        let sa_str = match record.aux(b"SA") {
+            Ok(Aux::String(s)) => s,
+            _ => continue,
+        };
+
+        // Alignment of the record itself (position is 0-based).
+        let primary_mapping_chromosome = String::from_utf8_lossy(header.tid2name(record.tid() as u32)).into_owned();
+        let primary_mapping_position = i32::try_from(record.pos())?;
+        let primary_mapping_strand = record.strand().to_string();
+
+        for sa in sa_str.split(';').filter(|s| !s.is_empty()) {
+            let parts: Vec<&str> = sa.split(',').collect();
+            // A complete SA entry is rname,pos,strand,CIGAR,mapQ,NM.
+            if parts.len() < 6 {
+                continue;
+            }
+            let chrom = parts[0];
+            // Entries on `chromo` itself are not junctions to another contig.
+            if chrom == chromo {
+                continue;
+            }
+            // SA positions are 1-based; convert to 0-based to match `Record::pos()`.
+            let pos = parts[1].parse::<i32>()? - 1;
+            let strand = parts[2].parse::<char>()?;
+            numts_mapping_info.push((
+                primary_mapping_chromosome.clone(),
+                primary_mapping_position,
+                primary_mapping_strand.clone(),
+                chrom.to_string(),
+                pos,
+                strand.to_string(),
+                parts[3].to_string(),
+            ));
+        }
+    }
+
+    Ok(numts_mapping_info)
+}
+
+/// Merges breakpoints whose positions lie within `max_gap` of the previous one.
+///
+/// `vals` holds `(position, strand, mate_chrom, mate_pos, mate_strand)` tuples. They are
+/// fully sorted (position first, remaining fields as tie-breakers) and exact duplicates
+/// are removed, so the result does not depend on input order.
+///
+/// Returns one `(first, last, count)` triple per merged run, where `first` and `last`
+/// are the lowest- and highest-sorted tuples of the run and `count` is the number of
+/// distinct tuples it contains. An empty input yields an empty vector.
+pub fn merge_by_gap(vals: &[(i32, String, String, i32, String)], max_gap: i32) -> Vec<((i32, String, String, i32, String), (i32, String, String, i32, String), i32)> {
+    // A full sort is required before `dedup`, which only removes *adjacent* duplicates;
+    // sorting by position alone can leave identical tuples separated by other entries.
+    let mut sorted_vals: Vec<(i32, String, String, i32, String)> = vals.to_vec();
+    sorted_vals.sort();
+    sorted_vals.dedup();
+
+    let mut remaining = sorted_vals.into_iter();
+    let first = match remaining.next() {
+        Some(v) => v,
+        None => return Vec::new(),
+    };
+
+    let mut intervals = Vec::new();
+    let mut start = first.clone();
+    let mut prev = first;
+    let mut count = 1;
+
+    for x in remaining {
+        if x.0 - prev.0 <= max_gap {
+            // Still within reach of the previous breakpoint: extend the run.
+            prev = x;
+            count += 1;
+        } else {
+            // Gap too large: close the current run and open a new one at `x`.
+            intervals.push((start, prev, count));
+            start = x.clone();
+            prev = x;
+            count = 1;
+        }
+    }
+    intervals.push((start, prev, count));
+
+    intervals
+}
+/// Groups junctions by the chromosome of their SA entry and merges nearby positions.
+///
+/// Each input tuple is
+/// `(mt_chrom, mt_start, mt_strand, numt_chrom, numt_start, numt_strand, cigar)`.
+/// Junctions are bucketed by `numt_chrom`, and each bucket is passed to
+/// [`merge_by_gap`] with `max_gap_threshold`.
+///
+/// Returns `(chromosome, first, last, count)` tuples, where `first`/`last` are
+/// `(numt_pos, numt_strand, mt_chrom, mt_pos, mt_strand)` and `count` is the value
+/// reported by `merge_by_gap`. Chromosomes are emitted in lexicographic order so that
+/// repeated runs on the same input produce the same output.
+pub fn get_numts_intervals(
+    numts_mapping_info: &[(String, i32, String, String, i32, String, String)], 
+    max_gap_threshold: i32
+) -> Vec<(String, (i32, String, String, i32, String), (i32, String, String, i32, String), i32)> {
+    // Bucket breakpoints by the chromosome named in the SA entry.
+    let mut by_chrom: HashMap<String, Vec<(i32, String, String, i32, String)>> = HashMap::new();
+    for (mt_chrom, mt_start, mt_strand, numt_chrom, numt_start, numt_strand, _cigar) in numts_mapping_info {
+        by_chrom.entry(numt_chrom.clone())
+            .or_default()
+            .push((*numt_start, numt_strand.clone(), mt_chrom.clone(), *mt_start, mt_strand.clone()));
+    }
+
+    // HashMap iteration order is unspecified; sort the buckets for reproducible output.
+    let mut grouped: Vec<_> = by_chrom.into_iter().collect();
+    grouped.sort_by(|a, b| a.0.cmp(&b.0));
+
+    let mut numts_intervals = Vec::new();
+    for (chrom, positionlist) in grouped {
+        for (start, end, counts) in merge_by_gap(&positionlist, max_gap_threshold) {
+            numts_intervals.push((chrom.clone(), start, end, counts));
+        }
+    }
+    numts_intervals
+}
+
+/// Builds the VCF breakend (BND) ALT string that points at `mt_chrom:mt_pos`.
+///
+/// The bracket layout is selected by whether each strand string equals `"+"`;
+/// any other value is treated as reverse:
+///
+/// | `sa_strand` | `mt_strand` | result      |
+/// |-------------|-------------|-------------|
+/// | `+`         | `+`         | `N[chr:pos[` |
+/// | `+`         | not `+`     | `N]chr:pos]` |
+/// | not `+`     | `+`         | `]chr:pos]N` |
+/// | not `+`     | not `+`     | `[chr:pos[N` |
+///
+/// where `N` is `ref_base`. `sa_chrom` and `sa_pos` are accepted but do not
+/// influence the output.
+///
+/// NOTE(review): the strand-to-bracket mapping is kept as written; confirm it against
+/// the breakend section of the VCF specification.
+fn format_bnd_alt(
+    ref_base: &str,
+    mt_chrom: &str,
+    mt_pos: i32,
+    mt_strand: &str,
+    sa_chrom: &str,
+    sa_pos: i32,
+    sa_strand: &str,
+) -> String {
+    // Mate locus shared by all four layouts.
+    let mate = format!("{}:{}", mt_chrom, mt_pos);
+
+    match (sa_strand == "+", mt_strand == "+") {
+        (true, true) => format!("{}[{}[", ref_base, mate),
+        (true, false) => format!("{}]{}]", ref_base, mate),
+        (false, true) => format!("]{}]{}", mate, ref_base),
+        (false, false) => format!("[{}[{}", mate, ref_base),
+    }
+}
+
+/// Writes one BND record per merged NUMT interval to a VCF file.
+///
+/// # Arguments
+/// * `output_vcf` - path of the VCF file to create (overwritten if present).
+/// * `numts_intervals` - `(chrom, first, last, count)` tuples; only `first`
+///   (`(pos, strand, mt_chrom, mt_pos, mt_strand)`) and `count` are used. Positions
+///   are treated as 0-based and converted to 1-based for the VCF.
+/// * `reference_file` - FASTA used for the `##contig` header lines and the REF base.
+/// * `sample_name` - name of the single sample column.
+/// * `minimal_ac` - intervals with `count` below this value are not written.
+///
+/// # Errors
+/// Returns an error if the reference cannot be read, the output cannot be written,
+/// an interval names a contig absent from the reference, or a breakpoint position
+/// lies outside its contig.
+pub fn write_bnd_vcf(
+    output_vcf: &PathBuf,
+    numts_intervals: &Vec<(String, (i32, String, String, i32, String), (i32, String, String, i32, String), i32)>,
+    reference_file: &PathBuf,
+    sample_name: &str,
+    minimal_ac: i32,
+) -> Result<(), Box<dyn std::error::Error>> {
+    use std::fs::File;
+    use std::io::BufWriter;
+
+    // Load the reference first so an unreadable FASTA is reported before any output is created.
+    let ref_reader = FastaReader::from_file(reference_file)
+        .map_err(|e| format!("Could not open reference FASTA: {}", e))?;
+    let reference_sequence: Vec<FastaRecord> = ref_reader
+        .records()
+        .collect::<Result<Vec<FastaRecord>, _>>()
+        .map_err(|e| format!("Could not parse reference FASTA: {}", e))?;
+
+    // Buffer the output; one syscall per record line is wasteful.
+    let mut vcf_file = BufWriter::new(File::create(output_vcf)?);
+
+    // Write VCF header
+    writeln!(vcf_file, "##fileformat=VCFv4.3")?;
+    for record in reference_sequence.iter() {
+        writeln!(vcf_file, "##contig=<ID={},length={}>", record.id(), record.seq().len() as u64)?;
+    }
+    writeln!(vcf_file, "##ALT=<ID=BND,Description=\"NUMTs breakpoints\">")?;
+    writeln!(vcf_file, "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">")?;
+    writeln!(vcf_file, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">")?;
+    writeln!(vcf_file, "##FORMAT=<ID=AD,Number=1,Type=Integer,Description=\"Supporting Read Counts\">")?;
+    writeln!(vcf_file, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{}", sample_name)?;
+
+    // Write BND records
+    for (auto_chrom, (auto_start, auto_strand, mt_chrom, mt_pos, mt_strand), _interval_end, counts) in numts_intervals {
+        // Skip under-supported intervals before doing any reference lookup.
+        if *counts < minimal_ac {
+            continue;
+        }
+
+        let contig = reference_sequence
+            .iter()
+            .find(|r| r.id() == auto_chrom.as_str())
+            .ok_or_else(|| format!("Contig {} not found in reference FASTA", auto_chrom))?;
+
+        // VCF uses 1-based positions; the reference base sits at the 0-based index.
+        let pos = auto_start + 1;
+        let base_index = usize::try_from(*auto_start)
+            .map_err(|_| format!("Negative breakpoint position {} on {}", auto_start, auto_chrom))?;
+        // Read the single base directly instead of copying the whole contig into a String.
+        let ref_base = contig
+            .seq()
+            .get(base_index)
+            .map(|b| (*b as char).to_string())
+            .ok_or_else(|| format!("Breakpoint {}:{} lies outside the reference contig", auto_chrom, pos))?;
+
+        // The mate position inside ALT is 1-based as well.
+        let alt = format_bnd_alt(
+            &ref_base,
+            mt_chrom,
+            *mt_pos + 1,
+            mt_strand,
+            auto_chrom,
+            pos,
+            auto_strand,
+        );
+
+        // Create variant ID
+        let variant_id = format!("BND_{}_{}_{}_{}_{}_{}", auto_chrom, auto_start, auto_strand, mt_chrom, mt_pos, mt_strand);
+
+        // INFO field
+        let info = "SVTYPE=BND";
+
+        // Column order must follow the header: CHROM, POS, ID, REF, ALT, ...
+        writeln!(
+            vcf_file,
+            "{}\t{}\t{}\t{}\t{}\t.\t.\t{}\tGT:AD\t0/1:{}",
+            auto_chrom, pos, variant_id, ref_base, alt, info, counts
+        )?;
+    }
+
+    // Flush explicitly: errors during an implicit flush on drop would be lost.
+    vcf_file.flush()?;
+    Ok(())
+}
+
+/// Runs the NUMT breakend pipeline: scan the BAM, merge breakpoints, write the VCF.
+///
+/// `chromo` is the contig whose records are scanned for SA tags, `max_gap_threshold`
+/// is the merge distance passed to [`get_numts_intervals`], and `minimal_ac` is the
+/// minimum count an interval needs to be written by [`write_bnd_vcf`].
+///
+/// # Errors
+/// Failures from [`find_numts`] or [`write_bnd_vcf`] are returned to the caller
+/// (prefixed with the failing stage) rather than aborting the process, so the
+/// returned `Result` must be checked.
+pub fn start(input_bam:&PathBuf, chromo: &str, max_gap_threshold: i32, output_vcf:&PathBuf, reference_file:&PathBuf, sample_name:&str, minimal_ac: i32) -> Result<(), Box<dyn std::error::Error>> {
+    let numts_mapping = find_numts(input_bam, chromo)
+        .map_err(|e| format!("Error finding numts: {}", e))?;
+    let numts_intervals = get_numts_intervals(&numts_mapping, max_gap_threshold);
+
+    write_bnd_vcf(output_vcf, &numts_intervals, reference_file, sample_name, minimal_ac)
+        .map_err(|e| format!("Error writing BND VCF: {}", e))?;
+
+    Ok(())
+}
diff --git a/src/correct.rs b/src/correct.rs
@@ -1,14 +1,10 @@
 // correct graph based on srWGS data
-use std::process::Command;
-use msbwt2::dynamic_bwt::{create_from_fastx,DynamicBWT};
 use msbwt2::msbwt_core::BWT;
 use msbwt2::string_util;
 use msbwt2::rle_bwt::RleBWT;
-use rust_htslib::htslib::fai_load_options_FAI_CREATE;
 
 use crate::{agg::*};
-use std::{path::PathBuf, fs::File, io::{self, Write}};
-use std::collections::{HashMap, HashSet};
+use std::path::PathBuf;
 use bio::io::fasta::{Reader, Record};
 
 
diff --git a/src/filter.rs b/src/filter.rs
@@ -1,6 +1,6 @@
 
 use std::{collections::HashSet, path::PathBuf};
-use rust_htslib::bam::{self, Read, Writer,IndexedReader, Header, Record, record::{Aux, AuxArray}};
+use rust_htslib::bam::{self, Read,IndexedReader, Header, record::Aux};
 
 
 
diff --git a/src/main.rs b/src/main.rs
diff --git a/src/methyl.rs b/src/methyl.rs
diff --git a/src/minorhap.rs b/src/minorhap.rs

Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,6 @@`
`1`	`1`
`2`	`2`	`use std::{collections::HashSet, path::PathBuf};`
`3`		`-use rust_htslib::bam::{self, Read, Writer,IndexedReader, Header, Record, record::{Aux, AuxArray}};`
	`3`	`+use rust_htslib::bam::{self, Read,IndexedReader, Header, record::Aux};`
`4`	`4`
`5`	`5`
`6`	`6`