Skip to content

Commit aacd30d

Browse files
authored
added NUMTs breakends caller (#22)
* added NUMTs breakends caller * added README call-numts * added version
1 parent 0aea521 commit aacd30d

File tree

12 files changed

+332
-20
lines changed

12 files changed

+332
-20
lines changed

Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[package]
22
name = "Himito"
3-
version = "0.1.0"
3+
version = "1.0.0"
44
edition = "2021"
55

66
[dependencies]

README.md

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,3 +63,41 @@ msbwt2-build -o sr_msbwt.npy <srWGS.chrM.fasta.gz>
6363
# enumerate all possible haplotypes within windows
6464
./target/release/Himito minorhap -g <output.gfa> -o <output.allhaplotype.fasta> -s <sample_id>
6565
```
66+
67+
### Jan 20th 2026 Updates: NUMTs Breakpoint Calling
68+
69+
Himito can identify and call NUMTs (Nuclear Mitochondrial DNA segments) breakpoints from long-read BAM files. NUMTs are segments of mitochondrial DNA that have been inserted into the nuclear genome. The NUMTs calling functionality (`callnumts.rs`) works by:
70+
71+
1. **Identifying NUMT reads**: Parses supplementary alignment (SA) tags in BAM files to identify reads that have:
72+
- Primary alignment to mitochondrial DNA (chrM) and supplementary alignment to nuclear chromosomes, OR
73+
- Primary alignment to nuclear chromosomes and supplementary alignment to mitochondrial DNA
74+
75+
2. **Merging breakpoints**: Groups breakpoints by chromosome and merges consecutive breakpoints within a specified gap threshold (`max_gap_threshold`) to create intervals. This helps reduce noise and identify true NUMT insertion sites.
76+
77+
3. **Writing BND records**: Outputs breakend (BND) structural variant records in VCF format, representing the NUMT breakpoints. Each BND record includes:
78+
- Breakpoint positions on both the nuclear chromosome and mitochondrial genome
79+
- Strand orientation information
80+
- Supporting read counts (allele depth)
81+
- Properly formatted BND ALT fields based on strand orientations
82+
83+
**Usage:**
84+
```bash
85+
./target/release/Himito call-numts -i sample.bam \
86+
-c chrM \
87+
-m 10000 \
88+
-r hg38.fa \
89+
-o numts_breakpoints.vcf \
90+
-s HG002 \
91+
-a 2
92+
```
93+
**Output format:**
94+
The output VCF file contains BND (breakend) records in standard VCFv4.3 format. Each record represents a NUMT breakpoint with:
95+
- `CHROM`: Nuclear chromosome where the breakpoint occurs
96+
- `POS`: Breakpoint position (1-based)
97+
- `ALT`: BND format string indicating the connection to mitochondrial DNA
98+
- `INFO`: Contains `SVTYPE=BND`
99+
- `FORMAT`: GT (genotype) and AD (allele depth/supporting read count)
100+
101+
This will identify NUMT breakpoints where reads have alignments spanning both mitochondrial and nuclear genomes, merge breakpoints within 10kb, and output only those with at least 2 supporting reads. If you want to examine all possible NUMTs BND, adjust -a to <=1.
102+
103+
ToDo: may perform local assembly to find sequence resolved NUMTs insertions

src/agg.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -418,7 +418,7 @@ pub fn construct_major_haplotype(graph:&GraphicalGenome) -> String {
418418
anchorlist.sort();
419419
let mut src = anchorlist.first().unwrap().to_string();
420420
let mut next_edge = "".to_string();
421-
let mut dst = anchorlist.last().unwrap().to_string();
421+
let dst = anchorlist.last().unwrap().to_string();
422422
let mut haplotype = String::new();
423423
// println!("{}", src);
424424
let mut entity_set = HashSet::new();
@@ -475,7 +475,7 @@ pub fn construct_major_haplotype_entitylist(graph:&GraphicalGenome) -> Vec<Strin
475475
anchorlist.sort();
476476
let mut src = anchorlist.first().unwrap().to_string();
477477
let mut next_edge = "".to_string();
478-
let mut dst = anchorlist.last().unwrap().to_string();
478+
let dst = anchorlist.last().unwrap().to_string();
479479
let mut haplotype = Vec::new();
480480
// println!("{}", src);
481481
let mut entity_set = HashSet::new();

src/asm.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ use std::{path::PathBuf, fs::File, io::{self, Write}};
44

55

66
fn write_fasta(outputfile: &PathBuf, sequence: String, header: &str) -> io::Result<()>{
7-
let mut index = 0;
7+
let index = 0;
88
let mut file = File::create(outputfile)?;
99

1010
let header = format!(">{} \n", header);

src/build.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ use serde::{Deserialize, Serialize};
88
use std::collections::HashMap;
99
use std::error::Error;
1010
use std::fs::File;
11-
use std::io::{self, BufRead, BufReader, Write};
11+
use std::io::{self, BufRead, Write};
1212

1313

1414
pub fn reverse_complement(kmer: &str) -> String {
@@ -334,7 +334,7 @@ pub fn create_edge_file(
334334
edgeindex += 1;
335335
}
336336
// debuging why edge sequence are empty
337-
let mut edge_seq = if src_pos == 0 {
337+
let edge_seq = if src_pos == 0 {
338338
let seq = contig.get(0..dst_pos).unwrap_or_default().to_string();
339339
if seq.is_empty() {
340340
println!("Warning: Empty edge sequence created for SOURCE. dst_pos: {}, contig_len: {}", dst_pos, contig.len());

src/callnumts.rs

Lines changed: 241 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,241 @@
1+
use std::{collections::HashMap, path::PathBuf, io::Write};
2+
use rust_htslib::bam::{Read,IndexedReader, record::Aux};
3+
use bio::io::fasta::{Reader as FastaReader, Record as FastaRecord};
4+
5+
6+
7+
pub fn find_numts(bam_file: &PathBuf, chromo: &str) -> Result<Vec<(String, i32, String, String, i32, String, String)>, Box<dyn std::error::Error>> {
8+
let mut numts_mapping_info: Vec<(String, i32, String, String, i32, String, String)> = Vec::new();
9+
let mut bam = IndexedReader::from_path(bam_file)?;
10+
11+
// Get the chromosome ID from the header
12+
let tid = bam.header().tid(chromo.as_bytes())
13+
.ok_or("Chromosome not found in BAM header")?;
14+
15+
// Get the chromosome length
16+
let chrom_length = bam.header().target_len(tid)
17+
.ok_or("Could not get chromosome length")?;
18+
19+
20+
// Set the region to fetch
21+
bam.fetch((tid, 0, chrom_length))?;
22+
let header = bam.header().clone();
23+
24+
for read in bam.records() {
25+
let record = read?;
26+
// Skip unmapped reads
27+
if record.is_unmapped() {
28+
continue;
29+
}
30+
// Get the target ID (tid) for the primary alignment
31+
let primary_tid = record.tid();
32+
// Convert tid to chromosome name using the header
33+
let primary_mapping_chromosome = String::from_utf8_lossy(&header.tid2name(primary_tid as u32)).to_string();
34+
// Get the primary alignment position (0-based)
35+
let primary_mapping_position = record.pos();
36+
// Get the primary alignment strand
37+
let primary_mapping_strand = record.strand().to_string();
38+
// Check for supplementary alignments
39+
if let Ok(sa_tag) = record.aux(b"SA") {
40+
if let Aux::String(sa_str) = sa_tag {
41+
let supplementary_positions: Vec<&str> = sa_str.split(';')
42+
.filter(|s| !s.is_empty())
43+
.collect();
44+
for sa in supplementary_positions {
45+
let parts: Vec<&str> = sa.split(',').collect();
46+
// println!("{:?}", parts);
47+
if parts.len() >= 6 {
48+
let chrom = parts[0];
49+
let pos = parts[1].parse::<i32>().unwrap();
50+
let strand = parts[2].parse::<char>().unwrap();
51+
let cigar = parts[3];
52+
if chrom != chromo.to_string() {
53+
numts_mapping_info.push((primary_mapping_chromosome.clone(), primary_mapping_position as i32, primary_mapping_strand.to_string(), chrom.to_string(), pos, strand.to_string(), cigar.to_string()));
54+
}
55+
}
56+
}
57+
}
58+
}
59+
60+
}
61+
62+
Ok(numts_mapping_info)
63+
}
64+
65+
/// Merges consecutive values that are within max_gap of each other into intervals
66+
/// Returns a vector of (start, end) tuples representing merged intervals
67+
pub fn merge_by_gap(vals: &[(i32, String, String, i32, String)], max_gap: i32) -> Vec<((i32, String, String, i32, String), (i32, String, String, i32, String), i32)> {
68+
if vals.is_empty() {
69+
return Vec::new();
70+
}
71+
72+
// Sort and deduplicate (equivalent to sorted(set(vals)))
73+
let mut sorted_vals: Vec<(i32, String, String, i32, String)> = vals.to_vec();
74+
sorted_vals.sort_by_key(|k| k.0);
75+
sorted_vals.dedup();
76+
77+
let mut intervals = Vec::new();
78+
let mut start = sorted_vals[0].clone();
79+
let mut prev = sorted_vals[0].clone();
80+
81+
// Iterate through sorted values starting from the second one
82+
let mut count = 0;
83+
for x in sorted_vals.clone().iter().skip(1) {
84+
if x.0 - prev.0 <= max_gap {
85+
// Extend the current interval
86+
prev = x.clone();
87+
count += 1;
88+
} else {
89+
// Save the current interval and start a new one
90+
intervals.push((start.clone(), prev.clone(), count+1));
91+
start = x.clone();
92+
prev = x.clone();
93+
count = 0;
94+
}
95+
}
96+
intervals.push((start, prev, count + 1));
97+
98+
intervals
99+
}
100+
/// Groups breakpoints by chromosome and merges positions within max_gap_threshold
101+
/// Returns intervals as (chromosome, start, end) tuples
102+
pub fn get_numts_intervals(
103+
numts_mapping_info: &[(String, i32, String, String, i32, String, String)],
104+
max_gap_threshold: i32
105+
) -> Vec<(String, (i32, String, String, i32, String), (i32, String, String, i32, String), i32)> {
106+
let mut numts_intervals: Vec<(String, (i32, String, String, i32, String), (i32, String, String, i32, String), i32)> = Vec::new();
107+
let mut stack: HashMap<String, Vec<(i32, String, String, i32, String)>> = HashMap::new();
108+
109+
// Group positions by chromosome
110+
for (_mt_chrom, _mt_start, _mt_strand, numt_chrom, numt_start, _numt_strand, _cigar) in numts_mapping_info {
111+
stack.entry(numt_chrom.clone())
112+
.or_insert_with(Vec::new)
113+
.push((*numt_start as i32,_numt_strand.clone(), _mt_chrom.clone(), *_mt_start, _mt_strand.clone()));
114+
}
115+
116+
// Merge positions by gap for each chromosome and create intervals
117+
for (chrom, positionlist) in stack {
118+
let merged = merge_by_gap(&positionlist, max_gap_threshold);
119+
for (start, end, counts) in merged {
120+
numts_intervals.push((chrom.clone(), start, end, counts));
121+
}
122+
}
123+
numts_intervals
124+
}
125+
126+
/// Formats BND ALT field based on strand orientations
127+
/// Returns the BND ALT string in VCF format
128+
fn format_bnd_alt(
129+
ref_base: &str,
130+
mt_chrom: &str,
131+
mt_pos: i32,
132+
mt_strand: &str,
133+
sa_chrom: &str,
134+
sa_pos: i32,
135+
sa_strand: &str,
136+
) -> String {
137+
// BND format:
138+
// - N[chr2:pos2[ means N connects to chr2:pos2 from left (both forward)
139+
// - N]chr2:pos2] means N connects to chr2:pos2 from right (primary forward, SA reverse)
140+
// - ]chr2:pos2]N means chr2:pos2 connects to N from left (primary reverse, SA forward)
141+
// - [chr2:pos2[N means chr2:pos2 connects to N from right (both reverse)
142+
143+
let primary_forward = sa_strand == "+";
144+
let mt_forward = mt_strand == "+";
145+
146+
if primary_forward && mt_forward {
147+
// Both forward: N[chr2:pos2[
148+
format!("{}[{}:{}[", ref_base, mt_chrom, mt_pos)
149+
} else if primary_forward && !mt_forward {
150+
// Primary forward, SA reverse: N]chr2:pos2]
151+
format!("{}]{}:{}]", ref_base, mt_chrom, mt_pos)
152+
} else if !primary_forward && mt_forward {
153+
// Primary reverse, SA forward: ]chr2:pos2]N
154+
format!("]{}:{}]{}", mt_chrom, mt_pos, ref_base)
155+
} else {
156+
// Both reverse: [chr2:pos2[N
157+
format!("[{}:{}[{}", mt_chrom, mt_pos, ref_base)
158+
}
159+
}
160+
161+
/// Writes BND records to a VCF file from SA tag information
162+
pub fn write_bnd_vcf(
163+
output_vcf: &PathBuf,
164+
numts_intervals: &Vec<(String, (i32, String, String, i32, String), (i32, String, String, i32, String), i32)>,
165+
reference_file: &PathBuf,
166+
sample_name: &str,
167+
minimal_ac: i32,
168+
) -> Result<(), Box<dyn std::error::Error>> {
169+
use std::fs::File;
170+
let mut vcf_file = File::create(output_vcf)?;
171+
172+
// reference fasta information
173+
let ref_reader = FastaReader::from_file(reference_file).unwrap();
174+
let reference_sequence: Vec<FastaRecord> = ref_reader.records().map(|r| r.unwrap()).collect();
175+
let ref_seq = String::from_utf8_lossy(reference_sequence[0].seq()).to_string();
176+
let ref_header = reference_sequence[0].id().to_string();
177+
178+
// Write VCF header
179+
writeln!(vcf_file, "##fileformat=VCFv4.3")?;
180+
for record in reference_sequence.iter() {
181+
let referencename = record.id().to_string();
182+
let referencelength = record.seq().len() as u64;
183+
writeln!(vcf_file, "##contig=<ID={},length={}>", referencename, referencelength)?;
184+
}
185+
writeln!(vcf_file, "##ALT=<ID=BND,Description=\"NUMTs breakpoints\">")?;
186+
writeln!(vcf_file, "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">")?;
187+
writeln!(vcf_file, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">")?;
188+
writeln!(vcf_file, "##FORMAT=<ID=AD,Number=1,Type=Integer,Description=\"Supporting Read Counts\">")?;
189+
writeln!(vcf_file, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{}", sample_name)?;
190+
191+
// Write BND records
192+
for (auto_chrom, (auto_start, auto_strand, mt_chrom, mt_pos, mt_strand), (auto_end, auto_end_strand, mt_chrom_end, mt_pos_end, mt_strand_end), counts) in numts_intervals {
193+
// VCF uses 1-based positions
194+
let pos = auto_start + 1;
195+
let ref_seq = reference_sequence
196+
.iter()
197+
.find(|r| r.id() == auto_chrom)
198+
.unwrap()
199+
.seq();
200+
let ref_seq_ = String::from_utf8_lossy(ref_seq).to_string();
201+
let ref_base = ref_seq_[(pos - 1) as usize..pos as usize].to_string();
202+
// Format BND ALT field
203+
let alt = format_bnd_alt(
204+
&ref_base,
205+
mt_chrom,
206+
*mt_pos,
207+
mt_strand,
208+
auto_chrom,
209+
*auto_start,
210+
auto_strand,
211+
);
212+
213+
// Create variant ID
214+
let variant_id = format!("BND_{}_{}_{}_{}_{}_{}", auto_chrom, auto_start, auto_strand, mt_chrom, mt_pos, mt_strand);
215+
216+
// INFO field
217+
let info = "SVTYPE=BND".to_string();
218+
219+
// Write the record
220+
if counts >= &minimal_ac {
221+
writeln!(
222+
vcf_file,
223+
"{}\t{}\t{}\t{}\t{}\t.\t.\t{}\tGT:AD\t0/1:{}",
224+
auto_chrom, pos, ref_base, variant_id, alt, info, counts
225+
)?;
226+
}
227+
228+
}
229+
230+
Ok(())
231+
}
232+
233+
pub fn start(input_bam:&PathBuf, chromo: &str, max_gap_threshold: i32, output_vcf:&PathBuf, reference_file:&PathBuf, sample_name:&str, minimal_ac: i32) -> Result<(), Box<dyn std::error::Error>> {
234+
235+
let numts_mapping = find_numts(input_bam, chromo).expect("Error finding numts");
236+
let numts_intervals = get_numts_intervals(&numts_mapping, max_gap_threshold);
237+
238+
write_bnd_vcf(&output_vcf, &numts_intervals, &reference_file, &sample_name, minimal_ac).expect("Error writing BND VCF");
239+
240+
Ok(())
241+
}

src/correct.rs

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,10 @@
11
// correct graph based on srWGS data
2-
use std::process::Command;
3-
use msbwt2::dynamic_bwt::{create_from_fastx,DynamicBWT};
42
use msbwt2::msbwt_core::BWT;
53
use msbwt2::string_util;
64
use msbwt2::rle_bwt::RleBWT;
7-
use rust_htslib::htslib::fai_load_options_FAI_CREATE;
85

96
use crate::{agg::*};
10-
use std::{path::PathBuf, fs::File, io::{self, Write}};
11-
use std::collections::{HashMap, HashSet};
7+
use std::path::PathBuf;
128
use bio::io::fasta::{Reader, Record};
139

1410

src/filter.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11

22
use std::{collections::HashSet, path::PathBuf};
3-
use rust_htslib::bam::{self, Read, Writer,IndexedReader, Header, Record, record::{Aux, AuxArray}};
3+
use rust_htslib::bam::{self, Read,IndexedReader, Header, record::Aux};
44

55

66

0 commit comments

Comments
 (0)