Skip to content

Commit 337b89e

Browse files
Merge pull request #62 from pangenome/merge_bed_format
`impg query`: Add merge logic for BED output
2 parents 96bc942 + b842d08 commit 337b89e

File tree

2 files changed

+60
-9
lines changed

2 files changed

+60
-9
lines changed

src/main.rs

Lines changed: 57 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,15 @@ enum Args {
116116
/// Output format: 'auto' (BED for -r, BEDPE for -b), 'bed', 'bedpe', or 'paf'
117117
#[clap(short = 'o', long, value_parser, default_value = "auto")]
118118
output_format: String,
119-
119+
120+
/// Maximum distance between regions to merge for BED output
121+
#[clap(short = 'd', long, value_parser, conflicts_with = "no_merge_bed", default_value_t = 0)]
122+
merge_distance: i32,
123+
124+
/// Disable merging for BED output
125+
#[clap(long, action, conflicts_with = "merge_distance")]
126+
no_merge_bed: bool,
127+
120128
/// Enable transitive queries
121129
#[clap(short = 'x', long, action)]
122130
transitive: bool,
@@ -182,6 +190,8 @@ fn main() -> io::Result<()> {
182190
target_range,
183191
target_bed,
184192
output_format,
193+
merge_distance,
194+
no_merge_bed,
185195
transitive,
186196
transitive_bfs,
187197
max_depth,
@@ -194,7 +204,7 @@ fn main() -> io::Result<()> {
194204

195205
if let Some(target_range) = target_range {
196206
let (target_name, target_range) = parse_target_range(&target_range)?;
197-
let results = perform_query(
207+
let mut results = perform_query(
198208
&impg,
199209
&target_name,
200210
target_range,
@@ -216,15 +226,17 @@ fn main() -> io::Result<()> {
216226
output_results_paf(&impg, results.into_iter().skip(1), None);
217227
},
218228
_ => { // 'auto' or 'bed'
229+
let merge_distance = if no_merge_bed { -1 } else { merge_distance };
230+
219231
// BED format - include the first element
220-
output_results_bed(&impg, results);
232+
output_results_bed(&impg, &mut results, merge_distance);
221233
}
222234
}
223235
} else if let Some(target_bed) = target_bed {
224236
let targets = parse_bed_file(&target_bed)?;
225237
info!("Parsed {} target ranges from BED file", targets.len());
226238
for (target_name, target_range, name) in targets {
227-
let results = perform_query(
239+
let mut results = perform_query(
228240
&impg,
229241
&target_name,
230242
target_range,
@@ -238,8 +250,10 @@ fn main() -> io::Result<()> {
238250
// Output results based on the format
239251
match output_format.as_str() {
240252
"bed" => {
253+
let merge_distance = if no_merge_bed { -1 } else { merge_distance };
254+
241255
// BED format - include the first element
242-
output_results_bed(&impg, results);
256+
output_results_bed(&impg, &mut results, merge_distance);
243257
},
244258
"paf" => {
245259
// Skip the first element (the input range) for PAF output
@@ -604,7 +618,9 @@ fn perform_query(
604618
}
605619
}
606620

607-
fn output_results_bed(impg: &Impg, results: Vec<AdjustedInterval>) {
621+
fn output_results_bed(impg: &Impg, results: &mut Vec<AdjustedInterval>, merge_distance: i32) {
622+
merge_bed_intervals(results, merge_distance);
623+
608624
for (overlap, _, _) in results {
609625
let overlap_name = impg.seq_index.get_name(overlap.metadata).unwrap();
610626
let (first, last, strand) = if overlap.first <= overlap.last {
@@ -616,6 +632,41 @@ fn output_results_bed(impg: &Impg, results: Vec<AdjustedInterval>) {
616632
}
617633
}
618634

635+
// New helper function to merge BED intervals
636+
fn merge_bed_intervals(results: &mut Vec<AdjustedInterval>, merge_distance: i32) {
637+
if results.len() > 1 && merge_distance >= 0 {
638+
// Sort by sequence ID and start position
639+
results.par_sort_by_key(|(query_interval, _, _)| {
640+
(
641+
query_interval.metadata,
642+
std::cmp::min(query_interval.first, query_interval.last),
643+
)
644+
});
645+
646+
let mut write_idx = 0;
647+
for read_idx in 1..results.len() {
648+
let (curr_interval, _, _) = &results[write_idx];
649+
let (next_interval, _, _) = &results[read_idx];
650+
651+
let curr_min = std::cmp::min(curr_interval.first, curr_interval.last);
652+
let curr_max = std::cmp::max(curr_interval.first, curr_interval.last);
653+
let next_min = std::cmp::min(next_interval.first, next_interval.last);
654+
let next_max = std::cmp::max(next_interval.first, next_interval.last);
655+
656+
if curr_interval.metadata != next_interval.metadata || next_min > curr_max + merge_distance {
657+
write_idx += 1;
658+
if write_idx != read_idx {
659+
results.swap(write_idx, read_idx);
660+
}
661+
} else {
662+
results[write_idx].0.first = curr_min.min(next_min);
663+
results[write_idx].0.last = curr_max.max(next_max);
664+
}
665+
}
666+
results.truncate(write_idx + 1);
667+
}
668+
}
669+
619670
fn output_results_bedpe<I>(impg: &Impg, results: I, name: Option<String>)
620671
where
621672
I: Iterator<Item = AdjustedInterval>,

src/partition.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -561,8 +561,8 @@ fn select_and_window_sequences(
561561
Ok(())
562562
}
563563

564-
fn merge_overlaps(overlaps: &mut Vec<(Interval<u32>, Vec<CigarOp>, Interval<u32>)>, max_gap: i32) {
565-
if overlaps.len() > 1 && max_gap >= 0 {
564+
fn merge_overlaps(overlaps: &mut Vec<(Interval<u32>, Vec<CigarOp>, Interval<u32>)>, merge_distance: i32) {
565+
if overlaps.len() > 1 && merge_distance >= 0 {
566566
// Sort by sequence ID and start position
567567
overlaps.par_sort_by_key(|(query_interval, _, _)| {
568568
(
@@ -581,7 +581,7 @@ fn merge_overlaps(overlaps: &mut Vec<(Interval<u32>, Vec<CigarOp>, Interval<u32>
581581
let next_min = std::cmp::min(next_interval.first, next_interval.last);
582582
let next_max = std::cmp::max(next_interval.first, next_interval.last);
583583

584-
if curr_interval.metadata != next_interval.metadata || next_min > curr_max + max_gap {
584+
if curr_interval.metadata != next_interval.metadata || next_min > curr_max + merge_distance {
585585
write_idx += 1;
586586
if write_idx != read_idx {
587587
overlaps.swap(write_idx, read_idx);

0 commit comments

Comments
 (0)