Skip to content

Commit 212d684

Browse files
committed
feat: add metagenomic mode support and update gene scoring logic
1 parent 574402a commit 212d684

File tree

9 files changed

+277
-46
lines changed

9 files changed

+277
-46
lines changed

orphos-cli/benches/prodigal_benchmarks.rs

Lines changed: 132 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,16 @@ fn run_original_prodigal(
1212
input_file: &str,
1313
output_file: &str,
1414
threads: Option<usize>,
15+
) -> Result<Duration, Box<dyn std::error::Error>> {
16+
run_original_prodigal_with_mode(input_file, output_file, threads, "single")
17+
}
18+
19+
// Helper function to run original prodigal with specified mode
20+
fn run_original_prodigal_with_mode(
21+
input_file: &str,
22+
output_file: &str,
23+
threads: Option<usize>,
24+
mode: &str,
1525
) -> Result<Duration, Box<dyn std::error::Error>> {
1626
let start = std::time::Instant::now();
1727

@@ -23,7 +33,7 @@ fn run_original_prodigal(
2333
.arg("-f")
2434
.arg("gff")
2535
.arg("-p")
26-
.arg("single");
36+
.arg(mode);
2737

2838
// Original prodigal doesn't have threading, but we include this for consistency
2939
if threads.is_some() {
@@ -49,6 +59,16 @@ fn run_pyrodigal(
4959
input_file: &str,
5060
output_file: &str,
5161
threads: Option<usize>,
62+
) -> Result<Duration, Box<dyn std::error::Error>> {
63+
run_pyrodigal_with_mode(input_file, output_file, threads, "single")
64+
}
65+
66+
// Helper function to run pyrodigal with specified mode
67+
fn run_pyrodigal_with_mode(
68+
input_file: &str,
69+
output_file: &str,
70+
threads: Option<usize>,
71+
mode: &str,
5272
) -> Result<Duration, Box<dyn std::error::Error>> {
5373
let start = std::time::Instant::now();
5474

@@ -60,7 +80,7 @@ fn run_pyrodigal(
6080
.arg("-f")
6181
.arg("gff")
6282
.arg("-p")
63-
.arg("single");
83+
.arg(mode);
6484

6585
if let Some(t) = threads {
6686
cmd.arg("-j").arg(t.to_string());
@@ -85,6 +105,16 @@ fn run_orphos_cli(
85105
input_file: &str,
86106
output_file: &str,
87107
threads: Option<usize>,
108+
) -> Result<Duration, Box<dyn std::error::Error>> {
109+
run_orphos_cli_with_mode(input_file, output_file, threads, "single")
110+
}
111+
112+
// Helper function to run orphos with specified mode
113+
fn run_orphos_cli_with_mode(
114+
input_file: &str,
115+
output_file: &str,
116+
threads: Option<usize>,
117+
mode: &str,
88118
) -> Result<Duration, Box<dyn std::error::Error>> {
89119
let start = std::time::Instant::now();
90120

@@ -106,7 +136,7 @@ fn run_orphos_cli(
106136
.arg("-f")
107137
.arg("gff")
108138
.arg("-p")
109-
.arg("single");
139+
.arg(mode);
110140

111141
let output = cmd.output()?;
112142
let duration = start.elapsed();
@@ -426,11 +456,109 @@ fn benchmark_scaling_analysis(c: &mut Criterion) {
426456
group.finish();
427457
}
428458

459+
/// Benchmark meta mode (metagenomic) - single threaded comparison
460+
fn benchmark_meta_mode_single_threaded(c: &mut Criterion) {
461+
let test_files = [
462+
("ecoli", "tests/data/ecoli.fasta"),
463+
("salmonella", "tests/data/salmonella.fasta"),
464+
];
465+
466+
for (name, input_file) in test_files {
467+
if !Path::new(input_file).exists() {
468+
eprintln!("Warning: {} not found, skipping benchmark", input_file);
469+
continue;
470+
}
471+
472+
let file_size = get_file_size(input_file);
473+
let mut group = c.benchmark_group(format!("{}_meta_single_threaded", name));
474+
group.throughput(Throughput::Bytes(file_size));
475+
476+
// Check if original prodigal is available
477+
if Command::new("prodigal").arg("--help").output().is_ok() {
478+
group.bench_function("original_prodigal_meta", |b| {
479+
b.iter_custom(|iters| {
480+
let mut total_duration = Duration::new(0, 0);
481+
for _ in 0..iters {
482+
let output_file = NamedTempFile::new().unwrap();
483+
let duration = run_original_prodigal_with_mode(
484+
black_box(input_file),
485+
output_file.path().to_str().unwrap(),
486+
None,
487+
"meta",
488+
)
489+
.unwrap_or_else(|e| {
490+
eprintln!("Original prodigal meta benchmark failed: {}", e);
491+
Duration::from_secs(0)
492+
});
493+
total_duration += duration;
494+
}
495+
total_duration
496+
});
497+
});
498+
} else {
499+
eprintln!(
500+
"Warning: Original prodigal not found, skipping original_prodigal_meta benchmark"
501+
);
502+
}
503+
504+
// Check if pyrodigal is available
505+
if Command::new("pyrodigal").arg("--help").output().is_ok() {
506+
group.bench_function("pyrodigal_meta", |b| {
507+
b.iter_custom(|iters| {
508+
let mut total_duration = Duration::new(0, 0);
509+
for _ in 0..iters {
510+
let output_file = NamedTempFile::new().unwrap();
511+
let duration = run_pyrodigal_with_mode(
512+
black_box(input_file),
513+
output_file.path().to_str().unwrap(),
514+
Some(1),
515+
"meta",
516+
)
517+
.unwrap_or_else(|e| {
518+
eprintln!("Pyrodigal meta benchmark failed: {}", e);
519+
Duration::from_secs(0)
520+
});
521+
total_duration += duration;
522+
}
523+
total_duration
524+
});
525+
});
526+
} else {
527+
eprintln!("Warning: Pyrodigal not found, skipping pyrodigal_meta benchmark");
528+
}
529+
530+
// Benchmark orphos meta mode (single-threaded)
531+
group.bench_function("orphos_meta", |b| {
532+
b.iter_custom(|iters| {
533+
let mut total_duration = Duration::new(0, 0);
534+
for _ in 0..iters {
535+
let output_file = NamedTempFile::new().unwrap();
536+
let duration = run_orphos_cli_with_mode(
537+
black_box(input_file),
538+
output_file.path().to_str().unwrap(),
539+
Some(1),
540+
"meta",
541+
)
542+
.unwrap_or_else(|e| {
543+
eprintln!("orphos-cli meta benchmark failed: {}", e);
544+
Duration::from_secs(0)
545+
});
546+
total_duration += duration;
547+
}
548+
total_duration
549+
});
550+
});
551+
552+
group.finish();
553+
}
554+
}
555+
429556
criterion_group!(
430557
name = benches;
431558
config = configure_criterion();
432559
targets = benchmark_single_threaded,
433560
// benchmark_multi_threaded,
434-
benchmark_scaling_analysis
561+
benchmark_scaling_analysis,
562+
benchmark_meta_mode_single_threaded
435563
);
436564
criterion_main!(benches);

orphos-cli/tests/snapshots/meta_genomes_equivalence__meta_genomes_gff_summary.snap

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,16 +3,16 @@ source: orphos-cli/tests/meta_genomes_equivalence.rs
33
expression: summary
44
---
55
file similarity(%) coord_match(%) exact orig_hash rust_hash
6-
NC_000913.3.fasta 5.957 99.652 no 9995c9861491 624c54e087bd
7-
NC_000915.1.fasta 0.829 99.171 no 661adac5bfad 3f661f42ee96
8-
NC_000962.3.fasta 9.641 99.799 no 5a26d7cdd39c 77a5b452ab9b
9-
NC_000964.3.fasta 1.017 99.410 no 80bd80b02c2e dbf14d16e018
10-
NC_001318.1.fasta 15.599 100.000 no fd3c884611ff 0966ac145b56
11-
NC_002516.2.fasta 16.726 99.930 no 38c34e144238 dac532d8e321
12-
NC_002528.1.fasta 38.384 100.000 no f7f46f0a7a71 22cc1aff9eae
13-
NC_002745.2.fasta 3.599 99.846 no 797f59fb9866 4cf1d1ca1aad
14-
NC_003888.3.fasta 4.550 99.923 no 082e59efe934 0651fa453cde
15-
NC_005072.1.fasta 13.788 99.736 no f1398c92c767 94f273ed0d9a
16-
NC_008512.1.fasta 24.599 100.000 no 927216b98ab4 fcbf39edf768
17-
fragment.fasta 14.761 99.708 no 17bf514adad9 f1da8e43ec38
18-
fragment2.fasta 25.926 100.000 no df7200909f9a 2cb02a1b7e96
6+
NC_000913.3.fasta 56.452 99.930 no 9995c9861491 41c6dad6f214
7+
NC_000915.1.fasta 61.900 99.873 no 661adac5bfad 718323dfeee3
8+
NC_000962.3.fasta 28.998 99.950 no 5a26d7cdd39c 107315ed518a
9+
NC_000964.3.fasta 9.312 100.000 no 80bd80b02c2e 76f7d8cda2d5
10+
NC_001318.1.fasta 57.557 100.000 no fd3c884611ff 333684f6b76e
11+
NC_002516.2.fasta 20.098 99.965 no 38c34e144238 b5d18dc81ee8
12+
NC_002528.1.fasta 100.000 100.000 yes f7f46f0a7a71 f7f46f0a7a71
13+
NC_002745.2.fasta 12.859 99.962 no 797f59fb9866 2b72e6297f92
14+
NC_003888.3.fasta 10.750 99.923 no 082e59efe934 f68bb4b2a257
15+
NC_005072.1.fasta 13.664 100.000 no f1398c92c767 11103121e304
16+
NC_008512.1.fasta 100.000 100.000 yes 927216b98ab4 927216b98ab4
17+
fragment.fasta 99.422 100.000 no 17bf514adad9 18f398b80b81
18+
fragment2.fasta 100.000 100.000 yes df7200909f9a df7200909f9a
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
---
2+
source: orphos-cli/tests/meta_genomes_equivalence.rs
3+
expression: summary
4+
---
5+
file similarity(%) coord_match(%) exact orig_hash rust_hash
6+
ecoli.fasta 56.452 99.930 no 00a4ae70baa3 73394cf5247e
7+
salmonella.fasta 17.403 99.957 no 1f477c1793a5 a25cdf8bc731

orphos-core/src/algorithms/gene_finding/annotation.rs

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ use bio::bio_types::strand::Strand;
33
use crate::{
44
constants::{MAX_CONFIDENCE_SCORE, RBS_DESCRIPTIONS},
55
sequence::mer_text,
6-
types::{Gene, GeneAnnotation, GeneScore, Node, StartType, Training},
6+
types::{CodonType, Gene, GeneAnnotation, GeneScore, Node, StartType, Training},
77
};
88

99
/// Record gene annotation and score.
@@ -125,6 +125,48 @@ fn calculate_confidence(score: f64, start_weight: f64) -> f64 {
125125
}
126126
}
127127

128+
/// Update gene scores from re-scored nodes.
129+
///
130+
/// This function is used in metagenomic mode to update gene scores after the final
131+
/// re-scoring with the best model. It finds the corresponding start node by matching
132+
/// the gene's begin/end positions and strand, then updates the score fields.
133+
///
134+
/// # Arguments
135+
/// * `genes` - Mutable slice of genes to update
136+
/// * `nodes` - Slice of re-scored nodes
137+
/// * `training` - Training parameters for confidence calculation
138+
pub fn update_gene_scores_from_nodes(genes: &mut [Gene], nodes: &[Node], training: &Training) {
139+
for gene in genes.iter_mut() {
140+
// Find the start node by matching position and strand
141+
// For forward strand: start_node.index + 1 == gene.begin
142+
// For reverse strand: start_node.index + 1 == gene.end
143+
let target_pos = if gene.coordinates.strand == Strand::Forward {
144+
gene.coordinates.begin.saturating_sub(1)
145+
} else {
146+
gene.coordinates.end.saturating_sub(1)
147+
};
148+
149+
if let Some(start_node) = nodes.iter().find(|n| {
150+
n.position.index == target_pos
151+
&& n.position.strand == gene.coordinates.strand
152+
&& n.position.codon_type != CodonType::Stop
153+
}) {
154+
let total_score = start_node.scores.coding_score + start_node.scores.start_score;
155+
let confidence = calculate_confidence(total_score, training.start_weight_factor);
156+
157+
gene.score = GeneScore {
158+
confidence,
159+
total_score,
160+
coding_score: start_node.scores.coding_score,
161+
start_score: start_node.scores.start_score,
162+
ribosome_binding_score: start_node.scores.ribosome_binding_score,
163+
upstream_score: start_node.scores.upstream_score,
164+
type_score: start_node.scores.type_score,
165+
};
166+
}
167+
}
168+
}
169+
128170
#[cfg(test)]
129171
mod tests {
130172
use super::*;
@@ -186,6 +228,7 @@ mod tests {
186228
},
187229
score: GeneScore::default(),
188230
annotation: GeneAnnotation::default(),
231+
display_score: None,
189232
}
190233
}
191234

orphos-core/src/algorithms/gene_finding/mod.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,4 +3,5 @@ pub mod creation;
33
pub mod optimization;
44
pub mod scoring;
55

6+
pub use annotation::update_gene_scores_from_nodes;
67
pub use optimization::GeneBuilder;

0 commit comments

Comments
 (0)