@@ -11,6 +11,7 @@ use crate::skani::SkaniClusterer;
1111use crate :: skani:: SkaniPreclusterer ;
1212use crate :: ClusterDistanceFinder ;
1313use crate :: PreclusterDistanceFinder ;
14+ use crate :: QualityFinder ;
1415use crate :: SortedPairGenomeDistanceCache ;
1516use bird_tool_utils:: clap_utils:: * ;
1617use bird_tool_utils:: clap_utils:: { default_roff, monospace_roff} ;
@@ -117,6 +118,8 @@ pub struct GalahClustererCommandDefinition {
117118 pub dereplication_ani_argument : String ,
118119 pub dereplication_prethreshold_ani_argument : String ,
119120 pub dereplication_quality_formula_argument : String ,
121+ pub dereplication_run_checkm2_argument : String ,
122+ pub dereplication_checkm2_db_path_argument : String ,
120123 pub dereplication_precluster_method_argument : String ,
121124 pub dereplication_cluster_method_argument : String ,
122125 pub dereplication_aligned_fraction_argument : String ,
@@ -140,6 +143,8 @@ lazy_static! {
140143 dereplication_ani_argument: "ani" . to_string( ) ,
141144 dereplication_prethreshold_ani_argument: "precluster-ani" . to_string( ) ,
142145 dereplication_quality_formula_argument: "quality-formula" . to_string( ) ,
146+ dereplication_run_checkm2_argument: "run-checkm2" . to_string( ) ,
147+ dereplication_checkm2_db_path_argument: "checkm2-db-path" . to_string( ) ,
143148 dereplication_precluster_method_argument: "precluster-method" . to_string( ) ,
144149 dereplication_cluster_method_argument: "cluster-method" . to_string( ) ,
145150 dereplication_aligned_fraction_argument: "min-aligned-fraction" . to_string( ) ,
@@ -284,6 +289,15 @@ pub fn add_dereplication_filtering_parameters_to_section(section: Section) -> Se
284289 "Ignore genomes with more contamination than \
285290 this percentage. [default: not set]",
286291 ) )
292+ . flag ( Flag :: new ( ) . long ( "--run-checkm2" ) . help (
293+ "Run CheckM2 to generate quality scoring used for clustering. Requires \
294+ --checkm2-db-path or CHECKM2DB env variable to be set.",
295+ ) )
296+ . option (
297+ Opt :: new ( "DB_PATH" ) . long ( "--checkm2-db-path" ) . help (
298+ "Path to CheckM2 database (required for running CheckM2) \
299+ [default: from CHECKM2DB environment variable]",
300+ ) )
287301}
288302
289303pub fn add_dereplication_clustering_parameters_to_section (
@@ -838,9 +852,10 @@ pub fn filter_genomes_through_checkm<'a>(
838852 match clap_matches. contains_id ( "checkm-tab-table" )
839853 || clap_matches. contains_id ( "genome-info" )
840854 || clap_matches. contains_id ( "checkm2-quality-report" )
855+ || clap_matches. contains_id ( "run-checkm2" )
841856 {
842857 false => {
843- warn ! ( "Since CheckM input is missing , genomes are not being ordered by quality. Instead the order of their input is being used" ) ;
858+ warn ! ( "Since CheckM input has not been provided and CheckM2 has been disabled , genomes are not being ordered by quality. Instead the order of their input is being used" ) ;
844859 Ok ( genome_fasta_files. iter ( ) . map ( |s| & * * s) . collect ( ) )
845860 }
846861 true => {
@@ -882,6 +897,27 @@ pub fn filter_genomes_through_checkm<'a>(
882897 )
883898 . expect ( "Error parsing genomeInfo file" ) ,
884899 }
900+ } else if clap_matches. contains_id ( "run-checkm2" ) {
901+ // Run CheckM2 as in analyse
902+ let db_path = clap_matches. get_one :: < String > ( "checkm2-db-path" )
903+ . map ( |s| s. to_string ( ) )
904+ . or_else ( || std:: env:: var ( "CHECKM2DB" ) . ok ( ) )
905+ . expect ( "CheckM2 database path must be provided via --checkm2-db-path or CHECKM2DB env var" ) ;
906+ use crate :: checkm2:: CheckM2Analyser ;
907+ let tmpdir = tempfile:: tempdir ( ) . expect ( "Failed to create tempdir for CheckM2" ) ;
908+ let tmp_path = tmpdir. path ( ) ;
909+ let mut analyser = CheckM2Analyser :: new ( db_path) ;
910+ analyser. prepare_comp_cont ( genome_fasta_files, 1 , tmp_path) ;
911+ CheckMResultEnum :: CheckM2Result {
912+ result : {
913+ let quality_report_path =
914+ tmp_path. join ( "checkm2" ) . join ( "quality_report.tsv" ) ;
915+ checkm:: CheckM2QualityReport :: read_file_path (
916+ quality_report_path. to_str ( ) . unwrap ( ) ,
917+ )
918+ . unwrap ( )
919+ } ,
920+ }
885921 } else {
886922 panic ! ( "Programming error" ) ;
887923 } ;
@@ -1568,6 +1604,13 @@ pub fn add_cluster_subcommand(app: clap::Command) -> clap::Command {
15681604 "Parks2020_reduced" ,
15691605 "dRep" ] )
15701606 . default_value ( crate :: DEFAULT_QUALITY_FORMULA ) )
1607+ . arg ( Arg :: new ( & * GALAH_COMMAND_DEFINITION . dereplication_run_checkm2_argument )
1608+ . long ( "run-checkm2" )
1609+ . help ( "Run CheckM2 for genome quality scoring during clustering" )
1610+ . action ( clap:: ArgAction :: SetTrue ) )
1611+ . arg ( Arg :: new ( & * GALAH_COMMAND_DEFINITION . dereplication_checkm2_db_path_argument )
1612+ . long ( "checkm2-db-path" )
1613+ . help ( "Path to CheckM2 database. If not specified, will use the CHECKM2DB environment variable if set." ) )
15711614 . arg ( Arg :: new ( & * GALAH_COMMAND_DEFINITION . dereplication_prethreshold_ani_argument )
15721615 . long ( "precluster-ani" )
15731616 . value_parser ( clap:: value_parser!( f32 ) )
0 commit comments