Skip to content

Commit 75a9079

Browse files
committed
add process subcommand that runs analyse and cluster
1 parent 22b75c5 commit 75a9079

File tree

11 files changed

+1164
-35
lines changed

11 files changed

+1164
-35
lines changed

src/analyse.rs

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ pub fn analyse<Q: QualityFinder, R: RrnaFinder, T: TrnaFinder>(
2525
rrna_finder: &R,
2626
trna_finder: &T,
2727
checkm2_quality_report: &Option<String>,
28+
output_quality_report_path: &Option<String>,
2829
checkm_tab_table: &Option<String>,
2930
barrnap_gff_list: &Option<String>,
3031
trnascan_out_list: &Option<String>,
@@ -119,6 +120,27 @@ pub fn analyse<Q: QualityFinder, R: RrnaFinder, T: TrnaFinder>(
119120
cache
120121
} else {
121122
quality_finder.prepare_comp_cont(genomes, threads, tmp_path);
123+
124+
// If requested, copy the CheckM2 quality report out of the temporary directory now
125+
if let Some(dest) = output_quality_report_path {
126+
let src = tmp_path.join("checkm2").join("quality_report.tsv");
127+
if let Some(parent) = std::path::Path::new(dest).parent() {
128+
if !parent.as_os_str().is_empty() {
129+
std::fs::create_dir_all(parent).map_err(|e| {
130+
format!("Failed to create parent directory for quality report output: {e}")
131+
})?;
132+
}
133+
}
134+
std::fs::copy(&src, dest).map_err(|e| {
135+
format!(
136+
"Failed to copy CheckM2 quality report from {} to {}: {}",
137+
src.display(),
138+
dest,
139+
e
140+
)
141+
})?;
142+
}
143+
122144
genomes
123145
.iter()
124146
.map(|g| (g.clone(), quality_finder.find_comp_cont(g)))
@@ -219,7 +241,7 @@ fn parse_barrnap_gff_list(
219241
Ok(rrna_cache)
220242
}
221243

222-
/// Parse two-column TSV mapping genome names to tRNAscan-SE output files
244+
/// Parse two-column TSV mapping genome names to tRNAscan-SE output files
223245
fn parse_trnascan_out_list(list_path: &str) -> Result<HashMap<String, usize>, String> {
224246
let mut trna_cache = HashMap::new();
225247
let content = std::fs::read_to_string(list_path)

src/analyse_argument_parsing.rs

Lines changed: 47 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -88,14 +88,18 @@ pub struct GalahAnalyser<'a> {
8888
}
8989

9090
impl GalahAnalyser<'_> {
91-
pub fn analyse(&mut self) -> Result<std::collections::HashMap<String, GenomeOutput>, String> {
91+
pub fn analyse(
92+
&mut self,
93+
output_quality_report_path: &Option<String>,
94+
) -> Result<std::collections::HashMap<String, GenomeOutput>, String> {
9295
crate::analyse::analyse(
9396
self.genome_fasta_files,
9497
self.threads,
9598
&mut self.quality_analyser,
9699
&self.rrna_analyser,
97100
&self.trna_analyser,
98101
&self.checkm2_quality_report,
102+
output_quality_report_path,
99103
&self.checkm_tab_table,
100104
&self.barrnap_gff_list,
101105
&self.trnascan_out_list,
@@ -108,6 +112,7 @@ pub struct GalahAnalyserCommandDefinition {
108112
pub rrna_method_argument: String,
109113
pub trna_method_argument: String,
110114
pub output_mimag_summary_argument: String,
115+
pub output_quality_report_argument: String,
111116
pub checkm2_db_path_argument: String,
112117
pub checkm2_quality_report_argument: String,
113118
pub checkm_tab_table_argument: String,
@@ -116,12 +121,13 @@ pub struct GalahAnalyserCommandDefinition {
116121
}
117122

118123
lazy_static! {
119-
static ref ANALYSE_COMMAND_DEFINITION: GalahAnalyserCommandDefinition = {
124+
pub static ref ANALYSE_COMMAND_DEFINITION: GalahAnalyserCommandDefinition = {
120125
GalahAnalyserCommandDefinition {
121126
quality_method_argument: "quality-method".to_string(),
122127
rrna_method_argument: "rrna-method".to_string(),
123128
trna_method_argument: "trna-method".to_string(),
124129
output_mimag_summary_argument: "output-mimag-summary".to_string(),
130+
output_quality_report_argument: "output-quality-report".to_string(),
125131
checkm2_db_path_argument: "checkm2-db-path".to_string(),
126132
checkm2_quality_report_argument: "checkm2-quality-report".to_string(),
127133
checkm_tab_table_argument: "checkm-tab-table".to_string(),
@@ -314,7 +320,21 @@ pub fn add_analyse_subcommand(app: clap::Command) -> clap::Command {
314320
Arg::new(&*ANALYSE_COMMAND_DEFINITION.output_mimag_summary_argument)
315321
.long("output-mimag-summary")
316322
.value_name("SUMMARY")
317-
.help("Path to output MIMAG summary file"),
323+
.help("Path to output MIMAG summary file")
324+
.required_unless_present_any([
325+
"output-quality-report",
326+
"full-help",
327+
"full-help-roff",]),
328+
)
329+
.arg(
330+
Arg::new(&*ANALYSE_COMMAND_DEFINITION.output_quality_report_argument)
331+
.long("output-quality-report")
332+
.value_name("REPORT")
333+
.help("Path to output CheckM2-format quality report")
334+
.required_unless_present_any([
335+
"output-mimag-summary",
336+
"full-help",
337+
"full-help-roff",]),
318338
)
319339
.arg(
320340
Arg::new(&*ANALYSE_COMMAND_DEFINITION.rrna_method_argument)
@@ -460,15 +480,22 @@ pub fn add_analyse_output_parameters_to_section(
460480
section: Section,
461481
definition: &GalahAnalyserCommandDefinition,
462482
) -> Section {
463-
section.option(
464-
Opt::new("PATH")
465-
.long(&format!("--{}", definition.output_mimag_summary_argument))
466-
.help("Output a tsv file summarising the MIMAG status for each genome."),
467-
)
483+
section
484+
.option(
485+
Opt::new("PATH")
486+
.long(&format!("--{}", definition.output_mimag_summary_argument))
487+
.help("Output a tsv file summarising the MIMAG status for each genome."),
488+
)
489+
.option(
490+
Opt::new("PATH")
491+
.long(&format!("--{}", definition.output_quality_report_argument))
492+
.help("Output a CheckM2-format quality report TSV file."),
493+
)
468494
}
469495

470496
/// Output destinations resolved from the analyse command-line arguments.
pub struct AnalyseOutput {
471497
/// MIMAG summary file, already created on disk; `None` when the argument was not given.
pub output_mimag_summary: Option<std::fs::File>,
498+
/// Destination path for the CheckM2 quality report; `None` when the argument was not given.
pub output_quality_report_path: Option<String>,
472499
}
473500

474501
pub fn setup_analyse_outputs(
@@ -479,8 +506,13 @@ pub fn setup_analyse_outputs(
479506
.get_one::<String>(&command_definition.output_mimag_summary_argument)
480507
.map(|o| std::fs::File::create(o).expect("Failed to open output MIMAG summary file"));
481508

509+
let output_quality_report_path = m
510+
.get_one::<String>(&command_definition.output_quality_report_argument)
511+
.map(|s| s.to_string());
512+
482513
AnalyseOutput {
483514
output_mimag_summary,
515+
output_quality_report_path,
484516
}
485517
}
486518

@@ -511,9 +543,11 @@ pub fn run_analyse_subcommand(
511543
let output_definitions = setup_analyse_outputs(m, &ANALYSE_COMMAND_DEFINITION);
512544

513545
info!("Analysing {} genomes ..", genome_fasta_files.len());
514-
let analysis = galah.analyse().expect("Failed to analyse genomes");
546+
let analysis = galah
547+
.analyse(&output_definitions.output_quality_report_path)
548+
.expect("Failed to analyse genomes");
515549

516-
write_analyse_outputs(output_definitions, &analysis, genome_fasta_files);
550+
write_analyse_outputs(output_definitions, &analysis, &genome_fasta_files);
517551
info!("Finished printing genome analysis");
518552
}
519553

@@ -580,10 +614,10 @@ fn generate_galah_analyser<'a>(
580614
})
581615
}
582616

583-
fn write_analyse_outputs(
617+
pub fn write_analyse_outputs(
584618
output_definitions: AnalyseOutput,
585619
analysis: &HashMap<String, GenomeOutput>,
586-
genome_fasta_files: Vec<String>,
620+
genome_fasta_files: &Vec<String>,
587621
) {
588622
if let Some(mut f) = output_definitions.output_mimag_summary {
589623
writeln!(
@@ -592,7 +626,7 @@ fn write_analyse_outputs(
592626
)
593627
.unwrap();
594628
for genome in genome_fasta_files {
595-
if let Some(output_data) = analysis.get(&*genome) {
629+
if let Some(output_data) = analysis.get(&**genome) {
596630
writeln!(
597631
f,
598632
"{genome}\t{completeness:.2}\t{contamination:.2}\t{r5s}\t{r16s}\t{r23s}\t{trnas}\t{mimag_quality}",

src/checkm2.rs

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,20 +12,36 @@ pub struct CheckM2Analyser {
1212
// Cache for completeness and contamination results
1313
pub comp_cont_cache: HashMap<String, (f64, f64)>,
1414
pub database_path: String,
15+
pub quality_report_source_path: Option<std::path::PathBuf>,
1516
}
1617

1718
impl CheckM2Analyser {
1819
pub fn new(database_path: String) -> Self {
1920
Self {
2021
comp_cont_cache: HashMap::new(),
2122
database_path,
23+
quality_report_source_path: None,
24+
}
25+
}
26+
27+
pub fn copy_quality_report(&self, dest_path: &str) -> Result<(), String> {
28+
if let Some(src_path) = &self.quality_report_source_path {
29+
std::fs::copy(src_path, dest_path).map_err(|e| {
30+
format!("Failed to copy quality report from {src_path:?} to {dest_path}: {e}")
31+
})?;
32+
Ok(())
33+
} else {
34+
Err("No quality report available to copy (CheckM2 may not have been run)".to_string())
2235
}
2336
}
2437
}
2538

2639
impl QualityFinder for CheckM2Analyser {
2740
fn prepare_comp_cont(&mut self, genome_paths: &[String], threads: usize, tmp_path: &Path) {
28-
self.comp_cont_cache = get_comp_cont(genome_paths, threads, tmp_path, &self.database_path);
41+
let (cache, quality_report_path) =
42+
get_comp_cont(genome_paths, threads, tmp_path, &self.database_path);
43+
self.comp_cont_cache = cache;
44+
self.quality_report_source_path = Some(quality_report_path);
2945
}
3046

3147
fn find_comp_cont(&self, genome_path: &str) -> (f64, f64) {
@@ -45,7 +61,7 @@ fn get_comp_cont(
4561
threads: usize,
4662
tmp_path: &Path,
4763
database_path: &str,
48-
) -> HashMap<String, (f64, f64)> {
64+
) -> (HashMap<String, (f64, f64)>, std::path::PathBuf) {
4965
let mut comp_cont_cache = HashMap::new();
5066
let checkm2_path = tmp_path.join("checkm2");
5167

@@ -136,5 +152,5 @@ fn get_comp_cont(
136152
);
137153
}
138154
}
139-
comp_cont_cache
155+
(comp_cont_cache, quality_report_path)
140156
}

src/cluster_argument_parsing.rs

Lines changed: 33 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -182,13 +182,13 @@ lazy_static! {
182182
183183
{}
184184
185-
{} cluster --genome-fasta-directory input_genomes/
185+
{} cluster --genome-fasta-directory input_genomes/
186186
--output-representative-fasta-directory output_directory/
187187
188188
{}
189189
190190
{} cluster --ani 95 --precluster-ani 90 --precluster-method finch
191-
--genome-fasta-list genomes.txt
191+
--genome-fasta-list genomes.txt
192192
--output-cluster-definition clusters.tsv
193193
194194
{}
@@ -199,7 +199,7 @@ lazy_static! {
199199
200200
{}
201201
202-
{} cluster --cluster-contigs --small-contigs --genome-fasta-files contigs.fasta
202+
{} cluster --cluster-contigs --small-contigs --genome-fasta-files contigs.fasta
203203
--output-cluster-definition contig_clusters.tsv
204204
205205
See {} cluster --full-help for further options and further detail.
@@ -308,7 +308,7 @@ pub fn add_dereplication_clustering_parameters_to_section(
308308
.option(
309309
Opt::new("FLOAT")
310310
.long(&format!("--{}", definition.dereplication_ani_argument))
311-
.help(&format!("Overall ANI level to dereplicate at with the primary clusterer. {}",
311+
.help(&format!("Overall ANI level to dereplicate at with the primary clusterer. {}",
312312
&default_roff(crate::DEFAULT_ANI))),
313313
)
314314
.option(
@@ -579,9 +579,11 @@ pub fn run_cluster_subcommand(
579579
}
580580

581581
let contig_names_owned = if cluster_contigs {
582-
if m.contains_id("output-representative-fasta-directory")
583-
|| m.contains_id("output-representative-fasta-directory-copy")
584-
{
582+
if m.contains_id(
583+
&GALAH_COMMAND_DEFINITION.dereplication_output_representative_fasta_directory,
584+
) || m.contains_id(
585+
&GALAH_COMMAND_DEFINITION.dereplication_output_representative_fasta_directory_copy,
586+
) {
585587
panic!("Cannot specify --cluster-contigs with --output-representative-fasta-directory or --output-representative-fasta-directory-copy");
586588
}
587589

@@ -674,6 +676,7 @@ pub fn run_cluster_subcommand(
674676
m,
675677
&GALAH_COMMAND_DEFINITION,
676678
ref_genomes_for_clusterer.as_deref(),
679+
None,
677680
)
678681
.expect("Failed to parse galah clustering arguments correctly");
679682

@@ -844,6 +847,7 @@ pub fn filter_genomes_through_checkm<'a>(
844847
genome_fasta_files: &'a Vec<String>,
845848
clap_matches: &clap::ArgMatches,
846849
argument_definition: &GalahClustererCommandDefinition,
850+
injected_quality_report: Option<String>,
847851
) -> std::result::Result<Vec<&'a str>, String> {
848852
if clap_matches.get_flag(&argument_definition.dereplication_cluster_contigs_argument) {
849853
return Ok(genome_fasta_files.iter().map(|s| &**s).collect());
@@ -852,7 +856,9 @@ pub fn filter_genomes_through_checkm<'a>(
852856
match clap_matches.contains_id("checkm-tab-table")
853857
|| clap_matches.contains_id("genome-info")
854858
|| clap_matches.contains_id("checkm2-quality-report")
855-
|| (clap_matches.contains_id("run-checkm2") && clap_matches.get_flag("run-checkm2"))
859+
|| injected_quality_report.is_some()
860+
|| (clap_matches.contains_id(&argument_definition.dereplication_run_checkm2_argument)
861+
&& clap_matches.get_flag(&argument_definition.dereplication_run_checkm2_argument))
856862
{
857863
false => {
858864
warn!("Since CheckM input has not been provided and CheckM2 has been disabled, genomes are not being ordered by quality. Instead the order of their input is being used");
@@ -897,11 +903,20 @@ pub fn filter_genomes_through_checkm<'a>(
897903
)
898904
.expect("Error parsing genomeInfo file"),
899905
}
900-
} else if clap_matches.contains_id("run-checkm2")
901-
&& clap_matches.get_flag("run-checkm2")
906+
} else if injected_quality_report.is_some() {
907+
info!("Reading injected CheckM2 Quality report ..");
908+
CheckMResultEnum::CheckM2Result {
909+
result: checkm::CheckM2QualityReport::read_file_path(
910+
injected_quality_report.as_deref().unwrap(),
911+
)
912+
.unwrap(),
913+
}
914+
} else if clap_matches
915+
.contains_id(&argument_definition.dereplication_run_checkm2_argument)
916+
&& clap_matches.get_flag(&argument_definition.dereplication_run_checkm2_argument)
902917
{
903918
// Run CheckM2 as in analyse
904-
let db_path = clap_matches.get_one::<String>("checkm2-db-path")
919+
let db_path = clap_matches.get_one::<String>(&argument_definition.dereplication_checkm2_db_path_argument)
905920
.map(|s| s.to_string())
906921
.or_else(|| std::env::var("CHECKM2DB").ok())
907922
.expect("CheckM2 database path must be provided via --checkm2-db-path or CHECKM2DB env var");
@@ -1190,6 +1205,7 @@ pub fn generate_galah_clusterer<'a>(
11901205
clap_matches: &clap::ArgMatches,
11911206
argument_definition: &GalahClustererCommandDefinition,
11921207
reference_genomes: Option<&[&str]>,
1208+
injected_quality_report: Option<String>,
11931209
) -> std::result::Result<GalahClusterer<'a>, String> {
11941210
crate::external_command_checker::check_for_fastani();
11951211

@@ -1204,7 +1220,12 @@ pub fn generate_galah_clusterer<'a>(
12041220
.as_str()
12051221
};
12061222

1207-
match filter_genomes_through_checkm(genome_fasta_paths, clap_matches, argument_definition) {
1223+
match filter_genomes_through_checkm(
1224+
genome_fasta_paths,
1225+
clap_matches,
1226+
argument_definition,
1227+
injected_quality_report,
1228+
) {
12081229
Err(e) => std::result::Result::Err(e),
12091230

12101231
Ok(v2) => {

src/lib.rs

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ pub mod fastani;
1010
pub mod finch;
1111
pub mod genome_info_file;
1212
pub mod genome_stats;
13+
pub mod process;
14+
pub mod process_argument_parsing;
1315
pub mod skani;
1416
pub mod sorted_pair_genome_distance_cache;
1517
pub mod trnascan;

src/main.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,13 @@ fn main() {
5353
crate_version!(),
5454
);
5555
}
56+
Some("process") => {
57+
galah::process_argument_parsing::run_process_subcommand(
58+
&matches,
59+
"galah",
60+
crate_version!(),
61+
);
62+
}
5663
_ => panic!("Programming error"),
5764
}
5865
}
@@ -122,5 +129,6 @@ fn build_cli() -> Command {
122129

123130
app = galah::cluster_argument_parsing::add_cluster_subcommand(app);
124131
app = galah::analyse_argument_parsing::add_analyse_subcommand(app);
132+
app = galah::process_argument_parsing::add_process_subcommand(app);
125133
app
126134
}

0 commit comments

Comments
 (0)