Skip to content

Commit 946db0b

Browse files
committed
allow checkm2 running in cluster mode
1 parent 97d1a47 commit 946db0b

File tree

2 files changed

+142
-1
lines changed

2 files changed

+142
-1
lines changed

src/cluster_argument_parsing.rs

Lines changed: 44 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ use crate::skani::SkaniClusterer;
1111
use crate::skani::SkaniPreclusterer;
1212
use crate::ClusterDistanceFinder;
1313
use crate::PreclusterDistanceFinder;
14+
use crate::QualityFinder;
1415
use crate::SortedPairGenomeDistanceCache;
1516
use bird_tool_utils::clap_utils::*;
1617
use bird_tool_utils::clap_utils::{default_roff, monospace_roff};
@@ -117,6 +118,8 @@ pub struct GalahClustererCommandDefinition {
117118
pub dereplication_ani_argument: String,
118119
pub dereplication_prethreshold_ani_argument: String,
119120
pub dereplication_quality_formula_argument: String,
121+
pub dereplication_run_checkm2_argument: String,
122+
pub dereplication_checkm2_db_path_argument: String,
120123
pub dereplication_precluster_method_argument: String,
121124
pub dereplication_cluster_method_argument: String,
122125
pub dereplication_aligned_fraction_argument: String,
@@ -140,6 +143,8 @@ lazy_static! {
140143
dereplication_ani_argument: "ani".to_string(),
141144
dereplication_prethreshold_ani_argument: "precluster-ani".to_string(),
142145
dereplication_quality_formula_argument: "quality-formula".to_string(),
146+
dereplication_run_checkm2_argument: "run-checkm2".to_string(),
147+
dereplication_checkm2_db_path_argument: "checkm2-db-path".to_string(),
143148
dereplication_precluster_method_argument: "precluster-method".to_string(),
144149
dereplication_cluster_method_argument: "cluster-method".to_string(),
145150
dereplication_aligned_fraction_argument: "min-aligned-fraction".to_string(),
@@ -284,6 +289,15 @@ pub fn add_dereplication_filtering_parameters_to_section(section: Section) -> Se
284289
"Ignore genomes with more contamination than \
285290
this percentage. [default: not set]",
286291
))
292+
.flag(Flag::new().long("--run-checkm2").help(
293+
"Run CheckM2 to generate quality scoring used for clustering. Requires \
294+
--checkm2-db-path or CHECKM2DB env variable to be set.",
295+
))
296+
.option(
297+
Opt::new("DB_PATH").long("--checkm2-db-path").help(
298+
"Path to CheckM2 database (required for running CheckM2) \
299+
[default: from CHECKM2DB environment variable]",
300+
))
287301
}
288302

289303
pub fn add_dereplication_clustering_parameters_to_section(
@@ -838,9 +852,10 @@ pub fn filter_genomes_through_checkm<'a>(
838852
match clap_matches.contains_id("checkm-tab-table")
839853
|| clap_matches.contains_id("genome-info")
840854
|| clap_matches.contains_id("checkm2-quality-report")
855+
|| clap_matches.contains_id("run-checkm2")
841856
{
842857
false => {
843-
warn!("Since CheckM input is missing, genomes are not being ordered by quality. Instead the order of their input is being used");
858+
warn!("Since CheckM input has not been provided and CheckM2 has been disabled, genomes are not being ordered by quality. Instead the order of their input is being used");
844859
Ok(genome_fasta_files.iter().map(|s| &**s).collect())
845860
}
846861
true => {
@@ -882,6 +897,27 @@ pub fn filter_genomes_through_checkm<'a>(
882897
)
883898
.expect("Error parsing genomeInfo file"),
884899
}
900+
} else if clap_matches.contains_id("run-checkm2") {
901+
// Run CheckM2 as in analyse
902+
let db_path = clap_matches.get_one::<String>("checkm2-db-path")
903+
.map(|s| s.to_string())
904+
.or_else(|| std::env::var("CHECKM2DB").ok())
905+
.expect("CheckM2 database path must be provided via --checkm2-db-path or CHECKM2DB env var");
906+
use crate::checkm2::CheckM2Analyser;
907+
let tmpdir = tempfile::tempdir().expect("Failed to create tempdir for CheckM2");
908+
let tmp_path = tmpdir.path();
909+
let mut analyser = CheckM2Analyser::new(db_path);
910+
analyser.prepare_comp_cont(genome_fasta_files, 1, tmp_path);
911+
CheckMResultEnum::CheckM2Result {
912+
result: {
913+
let quality_report_path =
914+
tmp_path.join("checkm2").join("quality_report.tsv");
915+
checkm::CheckM2QualityReport::read_file_path(
916+
quality_report_path.to_str().unwrap(),
917+
)
918+
.unwrap()
919+
},
920+
}
885921
} else {
886922
panic!("Programming error");
887923
};
@@ -1568,6 +1604,13 @@ pub fn add_cluster_subcommand(app: clap::Command) -> clap::Command {
15681604
"Parks2020_reduced",
15691605
"dRep"])
15701606
.default_value(crate::DEFAULT_QUALITY_FORMULA))
1607+
.arg(Arg::new(&*GALAH_COMMAND_DEFINITION.dereplication_run_checkm2_argument)
1608+
.long("run-checkm2")
1609+
.help("Run CheckM2 for genome quality scoring during clustering")
1610+
.action(clap::ArgAction::SetTrue))
1611+
.arg(Arg::new(&*GALAH_COMMAND_DEFINITION.dereplication_checkm2_db_path_argument)
1612+
.long("checkm2-db-path")
1613+
.help("Path to CheckM2 database. If not specified, will use $CHECKM2_DB_PATH environment variable if set."))
15711614
.arg(Arg::new(&*GALAH_COMMAND_DEFINITION.dereplication_prethreshold_ani_argument)
15721615
.long("precluster-ani")
15731616
.value_parser(clap::value_parser!(f32))

tests/test_cmdline.rs

Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,10 @@ extern crate assert_cli;
33
#[cfg(test)]
44
mod tests {
55
use assert_cli::Assert;
6+
use std::env;
7+
use std::fs;
8+
use std::path::Path;
9+
use tempfile::tempdir;
610

711
#[test]
812
fn test_completeness_4contamination_quality_score() {
@@ -986,4 +990,98 @@ mod tests {
986990
tests/data/abisko4/73.20110600_S2D.10.fna tests/data/abisko4/73.20110600_S2D.10.fna\n")
987991
.unwrap();
988992
}
993+
994+
#[test]
995+
#[ignore]
996+
fn test_cluster_real_checkm2() {
997+
let checkm2_db_path = std::env::var("CHECKM2DB")
998+
.expect("CHECKM2DB environment variable must be set to run this test");
999+
println!("Using CheckM2 database at {}", checkm2_db_path);
1000+
Assert::main_binary()
1001+
.with_args(&[
1002+
"cluster",
1003+
"--quality-formula",
1004+
"completeness-4contamination",
1005+
"--run-checkm2",
1006+
"--genome-fasta-files",
1007+
"tests/data/set1/1mbp.fna",
1008+
"tests/data/set1/500kb.fna",
1009+
"tests/data/abisko4/73.20120800_S1D.21.fna",
1010+
"tests/data/abisko4/73.20110800_S2M.16.fna",
1011+
"--precluster-method",
1012+
"finch",
1013+
"--output-cluster-definition",
1014+
"/dev/stdout",
1015+
])
1016+
.succeeds()
1017+
.stdout()
1018+
.is("\
1019+
tests/data/abisko4/73.20110800_S2M.16.fna\ttests/data/abisko4/73.20110800_S2M.16.fna\n\
1020+
tests/data/abisko4/73.20110800_S2M.16.fna\ttests/data/abisko4/73.20120800_S1D.21.fna\n\
1021+
tests/data/set1/500kb.fna\ttests/data/set1/500kb.fna\n\
1022+
tests/data/set1/500kb.fna\ttests/data/set1/1mbp.fna\n")
1023+
.unwrap();
1024+
}
1025+
1026+
fn setup_mock_bin(dir: &Path, genomes: &[(String, f64, f64)]) {
1027+
// CheckM2 writes to quality_report.tsv within the directory specified by -o
1028+
let mut checkm2_script = String::from("#!/bin/bash\n");
1029+
checkm2_script.push_str("out=\"\"\n");
1030+
checkm2_script.push_str("while [[ $# -gt 0 ]]; do\n");
1031+
checkm2_script.push_str(" case $1 in\n");
1032+
checkm2_script.push_str(" -o) out=$2; shift 2;;\n");
1033+
checkm2_script.push_str(" *) shift;;\n");
1034+
checkm2_script.push_str(" esac\n");
1035+
checkm2_script.push_str("done\n");
1036+
checkm2_script.push_str("mkdir -p \"$out\"\n");
1037+
checkm2_script.push_str("echo -e 'Name\tCompleteness\tContamination\tCompleteness_Model_Used\tTranslation_Table_Used\tCoding_Density\tContig_N50\tAverage_Gene_Length\tGenome_Size\tGC_Content\tTotal_Coding_Sequences\tTotal_Contigs\tMax_Contig_Length\tAdditional_Notes' > \"$out/quality_report.tsv\"\n");
1038+
for (genome, completeness, contamination) in genomes {
1039+
checkm2_script.push_str(&format!(
1040+
"echo -e '{genome}\t{completeness}\t{contamination}\tGradient Boost (General Model)\t11\t0.885\t5745\t235.3609865470852\t355151\t0.33\t446\t75\t24150\tNone' >> \"$out/quality_report.tsv\"\n"
1041+
));
1042+
}
1043+
let checkm2 = dir.join("checkm2");
1044+
fs::write(&checkm2, checkm2_script).unwrap();
1045+
let _ = std::process::Command::new("chmod")
1046+
.arg("+x")
1047+
.arg(&checkm2)
1048+
.status();
1049+
}
1050+
1051+
#[test]
1052+
fn test_cluster_mock_checkm2() {
1053+
let tmpdir = tempdir().unwrap();
1054+
setup_mock_bin(
1055+
tmpdir.path(),
1056+
&[
1057+
(String::from("500kb"), 90.0, 5.0),
1058+
(String::from("1mbp"), 95.0, 2.0),
1059+
],
1060+
);
1061+
let path = env::var("PATH").unwrap();
1062+
let new_path = format!("{}:{}", tmpdir.path().display(), path);
1063+
1064+
// Provide genomes in reverse order to their quality
1065+
Assert::main_binary()
1066+
.with_env(&[("PATH", new_path)])
1067+
.with_args(&[
1068+
"cluster",
1069+
"--quality-formula",
1070+
"completeness-4contamination",
1071+
"--run-checkm2",
1072+
"--genome-fasta-files",
1073+
"tests/data/set1/500kb.fna",
1074+
"tests/data/set1/1mbp.fna",
1075+
"--output-cluster-definition",
1076+
"/dev/stdout",
1077+
"--checkm2-db-path",
1078+
"/tmp/mockdb",
1079+
])
1080+
.succeeds()
1081+
.stdout()
1082+
.is("\
1083+
tests/data/set1/1mbp.fna\ttests/data/set1/1mbp.fna\n\
1084+
tests/data/set1/1mbp.fna\ttests/data/set1/500kb.fna\n")
1085+
.unwrap();
1086+
}
9891087
}

0 commit comments

Comments
 (0)