Skip to content

Commit b6868ac

Browse files
Merge pull request #98 from pangenome/impg-partition-fewer-files
`impg partition`: emit a single BED file by default
2 parents cbba0f9 + 371d225 commit b6868ac

File tree

3 files changed

+145
-44
lines changed

3 files changed

+145
-44
lines changed

README.md

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -103,9 +103,12 @@ impg query -p alignments.paf -r chr1:1000-2000 --transitive-dfs
103103
Partition the alignment into smaller pieces:
104104

105105
```bash
106-
# Basic partitioning with 1Mb windows
106+
# Basic partitioning with 1Mb windows (outputs a single partitions.bed file with the partition number in the 4th column)
107107
impg partition -p alignments.paf -w 1000000
108108

109+
# Output separate files for each partition
110+
impg partition -p alignments.paf -w 1000000 --separate-files
111+
109112
# Specify output folder for partition files (directory will be created if it doesn't exist)
110113
impg partition -p alignments.paf -w 1000000 --output-folder results
111114

@@ -124,13 +127,13 @@ impg partition -p alignments.paf -w 1000000 --selection-mode haplotype # by
124127
# Control transitive search depth and minimum sizes
125128
impg partition -p alignments.paf -w 1000000 -m 2 -l 10000
126129

127-
# Output as GFA, MAF or FASTA requires sequence files (--sequence-files or --sequence-list)
128-
impg partition -p alignments.paf -w 1000000 -o gfa --sequence-files *.fa --output-folder gfa_partitions
129-
impg partition -p alignments.paf -w 1000000 -o maf --sequence-list fastas.txt --output-folder maf_partitions
130-
impg partition -p alignments.paf -w 1000000 -o fasta --sequence-files *.fa --output-folder fasta_partitions
130+
# Output as GFA, MAF or FASTA requires sequence files (--sequence-files or --sequence-list) and the --separate-files flag
131+
impg partition -p alignments.paf -w 1000000 -o gfa --sequence-files *.fa --separate-files --output-folder gfa_partitions
132+
impg partition -p alignments.paf -w 1000000 -o maf --sequence-list fastas.txt --separate-files --output-folder maf_partitions
133+
impg partition -p alignments.paf -w 1000000 -o fasta --sequence-files *.fa --separate-files --output-folder fasta_partitions
131134

132135
# Works with AGC archives too
133-
impg partition -p alignments.paf -w 1000000 -o gfa --sequence-files genomes.agc --output-folder gfa_partitions
136+
impg partition -p alignments.paf -w 1000000 -o gfa --sequence-files genomes.agc --separate-files --output-folder gfa_partitions
134137
```
135138

136139
### Similarity

src/main.rs

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -416,6 +416,10 @@ enum Args {
416416
/// Minimum distance from sequence start/end - closer regions will be extended to the boundaries
417417
#[clap(long, value_parser, default_value_t = 3000)]
418418
min_boundary_distance: i32,
419+
420+
/// Output a separate file for each partition instead of a single combined BED file (required for 'gfa', 'maf' and 'fasta' output)
421+
#[clap(long, action)]
422+
separate_files: bool,
419423
},
420424
/// Query overlaps in the alignment
421425
Query {
@@ -522,6 +526,7 @@ fn main() -> io::Result<()> {
522526
selection_mode,
523527
min_missing_size,
524528
min_boundary_distance,
529+
separate_files,
525530
} => {
526531
validate_selection_mode(&selection_mode)?;
527532
validate_output_format(&output_format, &["bed", "gfa", "maf", "fasta"])?;
@@ -534,6 +539,17 @@ fn main() -> io::Result<()> {
534539
gfa_maf_fasta.force_large_region,
535540
)?;
536541

542+
// Validate single-file output compatibility
543+
if !separate_files && output_format != "bed" {
544+
return Err(io::Error::new(
545+
io::ErrorKind::InvalidInput,
546+
format!(
547+
"Single-file output is only supported for BED format. Use --separate-files for {} format.",
548+
output_format.to_uppercase()
549+
),
550+
));
551+
}
552+
537553
// Extract reverse_complement before moving gfa_maf_fasta
538554
let reverse_complement = gfa_maf_fasta.reverse_complement;
539555

@@ -562,6 +578,7 @@ fn main() -> io::Result<()> {
562578
scoring_params,
563579
reverse_complement,
564580
common.verbose > 1,
581+
separate_files,
565582
)?;
566583
}
567584
Args::Query {

src/partition.rs

Lines changed: 119 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ pub fn partition_alignments(
4545
scoring_params: Option<(u8, u8, u8, u8, u8, u8)>,
4646
reverse_complement: bool,
4747
debug: bool,
48+
separate_files: bool,
4849
) -> io::Result<()> {
4950
// Initialize windows from starting sequences if provided
5051
let mut windows = Vec::<(u32, i32, i32)>::new();
@@ -158,6 +159,9 @@ pub fn partition_alignments(
158159

159160
// Track temporary BED files for GFA/MAF conversion
160161
let mut temp_bed_files = Vec::new();
162+
163+
// Collect partitions for single-file output
164+
let mut collected_partitions: Vec<(usize, Vec<Interval<u32>>)> = Vec::new();
161165

162166
while !windows.is_empty() {
163167
if debug {
@@ -329,45 +333,68 @@ pub fn partition_alignments(
329333
.map(|(query_interval, _, _)| query_interval)
330334
.collect();
331335

332-
// Write partition
333-
match output_format {
334-
"bed" => {
335-
// Write BED file directly
336-
write_partition_bed(
337-
partition_num,
338-
&query_intervals,
339-
impg,
340-
output_folder,
341-
None,
342-
)?;
343-
}
344-
"gfa" | "maf" => {
345-
// Write temporary BED file with .tmp suffix
346-
write_partition_bed(
347-
partition_num,
348-
&query_intervals,
349-
impg,
350-
output_folder,
351-
Some(".tmp"),
352-
)?;
353-
temp_bed_files.push(partition_num);
354-
}
355-
"fasta" => {
356-
// Write FASTA file directly
357-
write_partition_fasta(
358-
partition_num,
359-
&query_intervals,
360-
impg,
361-
output_folder,
362-
sequence_index.expect("Sequence index not found"),
363-
reverse_complement,
364-
)?;
336+
// Write partition or collect for single-file output
337+
if separate_files {
338+
// Legacy behavior: write separate files
339+
match output_format {
340+
"bed" => {
341+
// Write BED file directly
342+
write_partition_bed(
343+
partition_num,
344+
&query_intervals,
345+
impg,
346+
output_folder,
347+
None,
348+
)?;
349+
}
350+
"gfa" | "maf" => {
351+
// Write temporary BED file with .tmp suffix
352+
write_partition_bed(
353+
partition_num,
354+
&query_intervals,
355+
impg,
356+
output_folder,
357+
Some(".tmp"),
358+
)?;
359+
temp_bed_files.push(partition_num);
360+
}
361+
"fasta" => {
362+
// Write FASTA file directly
363+
write_partition_fasta(
364+
partition_num,
365+
&query_intervals,
366+
impg,
367+
output_folder,
368+
sequence_index.expect("Sequence index not found"),
369+
reverse_complement,
370+
)?;
371+
}
372+
_ => {
373+
return Err(io::Error::new(
374+
io::ErrorKind::InvalidInput,
375+
format!("Unsupported output format: {}", output_format),
376+
));
377+
}
365378
}
366-
_ => {
367-
return Err(io::Error::new(
368-
io::ErrorKind::InvalidInput,
369-
format!("Unsupported output format: {}", output_format),
370-
));
379+
} else {
380+
// New behavior: collect partitions for single-file output
381+
match output_format {
382+
"bed" => {
383+
// Collect BED partitions
384+
collected_partitions.push((partition_num, query_intervals.clone()));
385+
}
386+
"gfa" | "maf" | "fasta" => {
387+
return Err(io::Error::new(
388+
io::ErrorKind::InvalidInput,
389+
"Single-file output is only supported for BED format. Use --separate-files for GFA, MAF, or FASTA formats.".to_string(),
390+
));
391+
}
392+
_ => {
393+
return Err(io::Error::new(
394+
io::ErrorKind::InvalidInput,
395+
format!("Unsupported output format: {}", output_format),
396+
));
397+
}
371398
}
372399
}
373400

@@ -462,6 +489,21 @@ pub fn partition_alignments(
462489
})?;
463490
}
464491

492+
// Write collected partitions as single file if not using separate files
493+
if !separate_files && !collected_partitions.is_empty() {
494+
info!(
495+
"Writing {} partitions to single {} file",
496+
collected_partitions.len(),
497+
output_format
498+
);
499+
write_single_partition_file(
500+
&collected_partitions,
501+
impg,
502+
output_format,
503+
output_folder,
504+
)?;
505+
}
506+
465507
// Calculate final percentage
466508
let final_percentage = (total_partitioned_length as f64 / total_sequence_length as f64) * 100.0;
467509
// Create formatted percentage string with conditional scientific notation
@@ -1344,6 +1386,45 @@ fn write_partition_fasta(
13441386
Ok(())
13451387
}
13461388

1389+
fn write_single_partition_file(
1390+
collected_partitions: &[(usize, Vec<Interval<u32>>)],
1391+
impg: &Impg,
1392+
output_format: &str,
1393+
output_folder: Option<&str>,
1394+
) -> io::Result<()> {
1395+
match output_format {
1396+
"bed" => {
1397+
// Create single BED file with partition column
1398+
let filename = "partitions.bed";
1399+
let full_path = create_output_path(output_folder, filename)?;
1400+
let file = File::create(full_path)?;
1401+
let mut writer = BufWriter::new(file);
1402+
1403+
for (partition_num, query_intervals) in collected_partitions {
1404+
for query_interval in query_intervals {
1405+
let name = impg.seq_index.get_name(query_interval.metadata).unwrap();
1406+
let (start, end) = if query_interval.first <= query_interval.last {
1407+
(query_interval.first, query_interval.last)
1408+
} else {
1409+
(query_interval.last, query_interval.first)
1410+
};
1411+
1412+
writeln!(writer, "{} {} {} {}", name, start, end, partition_num)?;
1413+
}
1414+
}
1415+
1416+
writer.flush()?;
1417+
Ok(())
1418+
}
1419+
_ => {
1420+
Err(io::Error::new(
1421+
io::ErrorKind::InvalidInput,
1422+
format!("Single-file output not supported for format: {}", output_format),
1423+
))
1424+
}
1425+
}
1426+
}
1427+
13471428
pub fn parse_bed_file(bed_file: &str) -> io::Result<Vec<(String, (i32, i32), String)>> {
13481429
let file = File::open(bed_file)?;
13491430
let reader = BufReader::new(file);

0 commit comments

Comments
 (0)