Skip to content

Commit 73b4c65

Browse files
committed
add checkm2 quality checking
1 parent 9be59f7 commit 73b4c65

File tree

10 files changed

+2133
-76
lines changed

10 files changed

+2133
-76
lines changed

galah.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,4 @@ dependencies:
99
- extern
1010
- barrnap >=0.9
1111
- trnascan-se >=2.0.12
12+
- checkm2 >=1.1.0

pixi.lock

Lines changed: 1631 additions & 33 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pixi.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ skani = ">=0.2.2"
1515
fastani = ">=1.3"
1616
"barrnap" = ">=0.9"
1717
"trnascan-se" = ">=2.0.12"
18+
checkm2 = ">=1.1.0"
1819

1920
# x86_64-specific dependencies
2021
[feature.main.dependencies]

src/analyse.rs

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,44 +1,63 @@
1+
use crate::QualityFinder;
12
use crate::RrnaFinder;
23
use crate::TrnaFinder;
34
use tempfile::tempdir;
45

56
#[derive(Debug, Clone)]
67
pub struct GenomeOutput {
8+
pub completeness: f64,
9+
pub contamination: f64,
710
pub r5s: usize,
811
pub r16s: usize,
912
pub r23s: usize,
1013
pub trnas: usize,
1114
pub mimag_quality: String,
1215
}
1316

14-
pub fn analyse<R: RrnaFinder, T: TrnaFinder>(
17+
pub fn analyse<Q: QualityFinder, R: RrnaFinder, T: TrnaFinder>(
1518
genomes: &[String],
19+
threads: usize,
20+
quality_finder: &mut Q,
1621
rrna_finder: &R,
1722
trna_finder: &T,
1823
) -> std::collections::HashMap<String, GenomeOutput> {
24+
let quality_method = quality_finder.method_name();
1925
let rrna_method = rrna_finder.method_name();
2026
let trna_method = trna_finder.method_name();
21-
2227
info!(
23-
"Running {} and {} on provided genomes...",
24-
rrna_method, trna_method
28+
"Running {}, {} and {} on provided genomes...",
29+
quality_method, rrna_method, trna_method
2530
);
31+
2632
let tmpdir = tempdir().expect("Failed to create tempdir");
2733
let tmp_path = tmpdir.path();
2834

35+
quality_finder.prepare_comp_cont(genomes, threads, tmp_path);
36+
2937
use std::collections::HashMap;
3038
let mut genome_outputs: HashMap<String, GenomeOutput> = HashMap::new();
3139
for genome_path in genomes {
40+
let (completeness, contamination) = quality_finder.find_comp_cont(genome_path);
3241
let (r5s, r16s, r23s) = rrna_finder.find_rrnas(genome_path, tmp_path);
3342
let trnas = trna_finder.find_trnas(genome_path, tmp_path);
34-
let mimag_quality = if r5s == 0 || r16s == 0 || r23s == 0 || trnas < 18 {
43+
let mimag_quality = if completeness < 50.0 || contamination >= 10.0 {
44+
"Low quality"
45+
} else if completeness <= 90.0
46+
|| contamination >= 5.0
47+
|| r5s == 0
48+
|| r16s == 0
49+
|| r23s == 0
50+
|| trnas < 18
51+
{
3552
"Medium quality"
3653
} else {
3754
"High quality"
3855
};
3956
genome_outputs.insert(
4057
genome_path.to_string(),
4158
GenomeOutput {
59+
completeness,
60+
contamination,
4261
r5s,
4362
r16s,
4463
r23s,
@@ -47,6 +66,5 @@ pub fn analyse<R: RrnaFinder, T: TrnaFinder>(
4766
},
4867
);
4968
}
50-
5169
genome_outputs
5270
}

src/analyse_argument_parsing.rs

Lines changed: 80 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@ use std::io::Write;
22

33
use crate::analyse::GenomeOutput;
44
use crate::barrnap::BarrnapAnalyser;
5+
use crate::checkm2::CheckM2Analyser;
56
use crate::trnascan::TrnascanAnalyser;
7+
use crate::QualityFinder;
68
use crate::RrnaFinder;
79
use crate::TrnaFinder;
810
use bird_tool_utils::clap_utils::*;
@@ -11,6 +13,33 @@ use bird_tool_utils_man::prelude::{Author, Flag, Manual, Opt, Section};
1113
use clap::*;
1214
use std::collections::HashMap;
1315

16+
pub enum QualityAnalyser {
17+
CheckM2(crate::checkm2::CheckM2Analyser),
18+
}
19+
20+
impl QualityFinder for QualityAnalyser {
21+
fn prepare_comp_cont(
22+
&mut self,
23+
genome_paths: &[String],
24+
threads: usize,
25+
tmp_path: &std::path::Path,
26+
) {
27+
match self {
28+
QualityAnalyser::CheckM2(a) => a.prepare_comp_cont(genome_paths, threads, tmp_path),
29+
}
30+
}
31+
fn find_comp_cont(&self, genome_path: &str) -> (f64, f64) {
32+
match self {
33+
QualityAnalyser::CheckM2(a) => a.find_comp_cont(genome_path),
34+
}
35+
}
36+
fn method_name(&self) -> &str {
37+
match self {
38+
QualityAnalyser::CheckM2(a) => a.method_name(),
39+
}
40+
}
41+
}
42+
1443
pub enum RrnaAnalyser {
1544
Barrnap(BarrnapAnalyser),
1645
}
@@ -48,32 +77,40 @@ impl TrnaFinder for TrnaAnalyser {
4877

4978
pub struct GalahAnalyser<'a> {
5079
pub genome_fasta_files: &'a [std::string::String],
80+
pub threads: usize,
81+
pub quality_analyser: QualityAnalyser,
5182
pub rrna_analyser: RrnaAnalyser,
5283
pub trna_analyser: TrnaAnalyser,
5384
}
5485

5586
impl GalahAnalyser<'_> {
56-
pub fn analyse(&self) -> std::collections::HashMap<String, GenomeOutput> {
87+
pub fn analyse(&mut self) -> std::collections::HashMap<String, GenomeOutput> {
5788
crate::analyse::analyse(
5889
self.genome_fasta_files,
90+
self.threads,
91+
&mut self.quality_analyser,
5992
&self.rrna_analyser,
6093
&self.trna_analyser,
6194
)
6295
}
6396
}
6497

6598
pub struct GalahAnalyserCommandDefinition {
99+
pub quality_method_argument: String,
66100
pub rrna_method_argument: String,
67101
pub trna_method_argument: String,
68102
pub output_mimag_summary_argument: String,
103+
pub checkm2_db_path_argument: String,
69104
}
70105

71106
lazy_static! {
72107
static ref ANALYSE_COMMAND_DEFINITION: GalahAnalyserCommandDefinition = {
73108
GalahAnalyserCommandDefinition {
109+
quality_method_argument: "quality-method".to_string(),
74110
rrna_method_argument: "rrna-method".to_string(),
75111
trna_method_argument: "trna-method".to_string(),
76112
output_mimag_summary_argument: "output-mimag-summary".to_string(),
113+
checkm2_db_path_argument: "checkm2-db-path".to_string(),
77114
}
78115
};
79116
}
@@ -96,9 +133,11 @@ lazy_static! {
96133
97134
{} analyse --genome-fasta-directory input_genomes/
98135
--output-mimag-summary mimag_summary.tsv
136+
--checkm2-db-path /path/to/checkm2_db
99137
100138
{}
101139
140+
CHECKM2DB=/path/to/checkm2_db
102141
{} analyse --genome-fasta-list genomes.txt
103142
--output-mimag-summary mimag_summary.tsv
104143
@@ -115,7 +154,8 @@ See {} analyse --full-help for further options and further detail.
115154
ansi_term::Colour::Green.paint("Analyse (determine MIMAG status of) genomes"),
116155
ansi_term::Colour::Purple.paint(
117156
"Example: Analyse the rRNA/tRNA content of a directory of .fna\n\
118-
FASTA files and output a summary of gene counts and MIMAG status\n\
157+
FASTA files, using CheckM2 database specified by argument,\n\
158+
and output a summary of gene counts and MIMAG status\n\
119159
to mimag_summary.tsv:"
120160
.to_string(),
121161
),
@@ -125,7 +165,8 @@ See {} analyse --full-help for further options and further detail.
125165
.and_then(|s| s.into_string().ok())
126166
.expect("Failed to find running program basename"),
127167
ansi_term::Colour::Purple.paint(
128-
"Example: Analyse a set of genomes with paths specified in genomes.txt, and\n\
168+
"Example: Analyse a set of genomes with paths specified in genomes.txt,\n\
169+
using CheckM2 database specified by environment variable, and\n\
129170
output the MIMAG summary to mimag_summary.tsv:"
130171
),
131172
std::env::current_exe()
@@ -248,6 +289,20 @@ pub fn add_analyse_subcommand(app: clap::Command) -> clap::Command {
248289
.value_parser(crate::TRNA_METHODS)
249290
.default_value(crate::DEFAULT_TRNA_METHOD)
250291
.help("Method for tRNA analysis"),
292+
)
293+
.arg(
294+
Arg::new(&*ANALYSE_COMMAND_DEFINITION.quality_method_argument)
295+
.long("quality-method")
296+
.value_parser(crate::QUALITY_METHODS)
297+
.default_value(crate::DEFAULT_QUALITY_METHOD)
298+
.help("Method for quality analysis"),
299+
)
300+
.arg(
301+
Arg::new(&*ANALYSE_COMMAND_DEFINITION.checkm2_db_path_argument)
302+
.long("checkm2-db-path")
303+
.value_name("CHECKM2DB")
304+
.help("Path to CheckM2 database (required for checkm2 quality method) [default: from CHECKM2DB environment variable]")
305+
.required(false),
251306
);
252307

253308
analyse_subcommand =
@@ -329,7 +384,7 @@ pub fn run_analyse_subcommand(
329384

330385
let genome_fasta_files: Vec<String> = parse_list_of_genome_fasta_files(m, true).unwrap();
331386

332-
let galah = generate_galah_analyser(&genome_fasta_files, m, &ANALYSE_COMMAND_DEFINITION)
387+
let mut galah = generate_galah_analyser(&genome_fasta_files, m, &ANALYSE_COMMAND_DEFINITION)
333388
.expect("Failed to parse galah analyse arguments correctly");
334389

335390
// Open file handles here so errors are caught before CPU-heavy commands
@@ -347,6 +402,21 @@ fn generate_galah_analyser<'a>(
347402
m: &ArgMatches,
348403
command_definition: &GalahAnalyserCommandDefinition,
349404
) -> Result<GalahAnalyser<'a>, String> {
405+
let threads = *m.get_one::<u16>("threads").unwrap() as usize;
406+
let checkm2_db_path = m
407+
.get_one::<String>(&command_definition.checkm2_db_path_argument)
408+
.map(|s| s.to_string())
409+
.or_else(|| std::env::var("CHECKM2DB").ok())
410+
.unwrap_or_default();
411+
412+
let quality_analyser = match m
413+
.get_one::<String>(&command_definition.quality_method_argument)
414+
.map(|s| s.as_str())
415+
{
416+
Some("checkm2") => QualityAnalyser::CheckM2(CheckM2Analyser::new(checkm2_db_path)),
417+
_ => return Err("Invalid quality method specified".to_string()),
418+
};
419+
350420
let rrna_analyser = match m
351421
.get_one::<String>(&command_definition.rrna_method_argument)
352422
.map(|s| s.as_str())
@@ -365,6 +435,8 @@ fn generate_galah_analyser<'a>(
365435

366436
Ok(GalahAnalyser {
367437
genome_fasta_files,
438+
threads,
439+
quality_analyser,
368440
rrna_analyser,
369441
trna_analyser,
370442
})
@@ -378,15 +450,17 @@ fn write_analyse_outputs(
378450
if let Some(mut f) = output_definitions.output_mimag_summary {
379451
writeln!(
380452
f,
381-
"genome\trRNA_5S\trRNA_16S\trRNA_23S\ttRNAs\tMIMAG_quality",
453+
"genome\tcompleteness\tcontamination\trRNA_5S\trRNA_16S\trRNA_23S\ttRNAs\tMIMAG_quality",
382454
)
383455
.unwrap();
384456
for genome in genome_fasta_files {
385457
if let Some(output_data) = analysis.get(&*genome) {
386458
writeln!(
387459
f,
388-
"{genome}\t{r5s}\t{r16s}\t{r23s}\t{trnas}\t{mimag_quality}",
460+
"{genome}\t{completeness}\t{contamination}\t{r5s}\t{r16s}\t{r23s}\t{trnas}\t{mimag_quality}",
389461
genome = genome,
462+
completeness = output_data.completeness,
463+
contamination = output_data.contamination,
390464
r5s = output_data.r5s,
391465
r16s = output_data.r16s,
392466
r23s = output_data.r23s,

src/barrnap.rs

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ impl RrnaFinder for BarrnapAnalyser {
1111
}
1212

1313
fn method_name(&self) -> &str {
14-
"barrnap"
14+
"Barrnap"
1515
}
1616
}
1717

@@ -51,13 +51,25 @@ pub fn run_barrnap(genome_path: &str, kingdom: &str, threads: usize, out_dir: &P
5151
])
5252
.output()
5353
.expect("Failed to run barrnap");
54+
55+
if !output.status.success() {
56+
info!(
57+
"Barrnap run on {} failed with {}.\nstdout:\n{}\nstderr:\n{}",
58+
genome_path,
59+
output.status,
60+
String::from_utf8_lossy(&output.stdout),
61+
String::from_utf8_lossy(&output.stderr)
62+
);
63+
panic!("Barrnap did not run successfully");
64+
}
65+
5466
fs::write(&gff_path, &output.stdout).expect("Failed to write barrnap output");
5567
gff_path
5668
}
5769

5870
/// Parse barrnap GFF file and count 5S, 16S, 23S rRNAs
5971
pub fn parse_rrna_types(gff_path: &str) -> (usize, usize, usize) {
60-
let content = std::fs::read_to_string(gff_path).unwrap_or_default();
72+
let content = std::fs::read_to_string(gff_path).unwrap();
6173
let mut r5s = 0;
6274
let mut r16s = 0;
6375
let mut r23s = 0;
@@ -82,7 +94,7 @@ pub fn parse_rrna_types(gff_path: &str) -> (usize, usize, usize) {
8294
}
8395

8496
pub fn parse_barrnap_hits(gff_path: &str) -> usize {
85-
let content = std::fs::read_to_string(gff_path).unwrap_or_default();
97+
let content = std::fs::read_to_string(gff_path).unwrap();
8698
content.lines().filter(|l| !l.starts_with('#')).count()
8799
}
88100

0 commit comments

Comments
 (0)