diff --git a/README.md b/README.md index 8425be0..226ccc7 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # MAGmax MAGmax is a dereplication tool designed to maximize the recovery of Metagenome-Assembled Genomes (MAGs) through bin Merging and reAssembly. It performs dereplication in three stages: (i) grouping bins based on average sequence identity, (ii) merging bins within each group, and (iii) reassembling the merged bins. -![Maxmax](https://github.com/user-attachments/assets/d5f01672-d894-4554-9329-ca5f324395f8) +![MAGmax](https://github.com/user-attachments/assets/802387bf-ae34-48b5-963f-978a0e2d10d5) ## Inputs MAGmax requires three input directories, @@ -19,7 +19,16 @@ The output directory contains dereplicated bins, and a text file listing the com magmax -b -r -m -f fasta -t 24 magmax -b -r -m -f fasta -t 24 -q quality_report.tsv // if CheckM2 result is already available - magmax -b -r -m -f fasta -t 24 --split // if input bins are not already split by sample id + magmax -b -r -m -f fasta -t 24 --split // if input bins are not already split by sample id + + +## Dereplication without reassembly +MAGmax provides an option to peform dereplication without reassembly using `--no-reassembly` flag. In this mode, MAGmax selects the best bin within each genomic cluster based on a quality score (defined as completeness - 5 * contamination) that also meets the user-defined completeness and contamination thresholds. When this option is enabled, only the bin directory (`-b`) is required as input. + + magmax -b --no-reassembly -f fasta -t 24 + magmax -b --no-reassembly -f fasta -t 24 -q quality_report.tsv // if CheckM2 result is already available + magmax -b --no-reassembly -f fasta -t 24 --split // if input bins are not already split by sample id + ## Installation ### Prerequisites @@ -64,26 +73,28 @@ Option 2: Build from source ## Options -b, --bindir Directory containing fasta files of bins + -r, --readdir + Directory containing read files + -m, --mapdir + Directory containing mapids files -i, --ani ANI for clustering bins (%) [default: 99] -c, --completeness Minimum completeness of bins (%) [default: 50] -p, --purity Mininum purity (1- contamination) of bins (%) [default: 95] - -m, --mapdir - Directory containing mapids files - -r, --readdir - Directory containing read files -f, --format Bin file extension [default: fasta] -t, --threads Number of threads to use [default: 8] + --no-reassembly + Perform dereplication without bin merging and reassembly --split Split clusters into sample-wise bins before processing -q, --qual Quality file produced by CheckM2 (quality_report.tsv) --assembler - assembler choice for reassembly step (spades|megahit) [default: spades, recommended] + Assembler choice for reassembly step (spades|megahit), spades is recommended [default: spades] -h, --help Print help -V, --version @@ -93,6 +104,11 @@ Option 2: Build from source This example test run demonstrates dereplication of bins using the provided toy dataset. In the `test/bins` directory, example bins generated with MetaBAT2 are given. In the `test/reads` directory, paired-end read files for two samples are given and in the `test/mapids` directory, mapid files mapping reads to contigs for each sample are given. Precomputed CheckM2 quality scores for the input bins are given in the `test/quality_report.tsv`. Run the following command to execute the test: magmax -b test/bins -r test/reads -m test/mapids -t 24 -q test/quality_report.tsv + +To run without reassembly, + + magmax -b test/bins --no-reassembly -t 24 -q test/quality_report.tsv // run dereplication without reassembly + After running MAGmax, an output folder named `mags_50comp_95purity` will be created in the `test` directory. This folder contains the following files: - `bins_checkm2_qualities.tsv` — Table summarizing the quality metrics of the dereplicated bins. diff --git a/environment.yml b/environment.yml index 5ea2a12..ffc7c1c 100644 --- a/environment.yml +++ b/environment.yml @@ -8,7 +8,4 @@ dependencies: - megahit - spades - seqtk - - checkm2 - pip - - pip: - - checkm2 diff --git a/src/main.rs b/src/main.rs index 272b343..ab67397 100644 --- a/src/main.rs +++ b/src/main.rs @@ -20,10 +20,15 @@ mod reassemble; // check for valid input paths fn validate_paths(cli: &Cli) -> io::Result<(PathBuf, PathBuf, PathBuf)> { let bindir = utility::validate_path(Some(&cli.bindir), "bindir", &cli.format); - let mapdir = utility::validate_path(Some(&cli.mapdir), "mapdir", "_mapids"); - let readdir = utility::validate_path(Some(&cli.readdir), "readdir", ".fastq"); - Ok((bindir.to_path_buf(), mapdir.to_path_buf(), readdir.to_path_buf())) + if cli.no_reassembly { + Ok((bindir.to_path_buf(), PathBuf::new(), PathBuf::new())) + } else { + let mapdir = utility::validate_path(cli.mapdir.as_ref(), "mapdir", "_mapids"); + let readdir = utility::validate_path(cli.readdir.as_ref(), "readdir", ".fastq"); + Ok((bindir.to_path_buf(), mapdir.to_path_buf(), readdir.to_path_buf())) + } + } #[derive(Parser)] @@ -34,12 +39,14 @@ struct Cli { bindir: PathBuf, /// Directory containing read files - #[arg(short = 'r', long = "readdir", help = "Directory containing read files")] - readdir: PathBuf, + #[arg(short = 'r', long = "readdir", help = "Directory containing read files", + requires_if("false", "no_reassembly"))] + readdir: Option, /// Directory containing mapids files derived from alignment sam/bam files - #[arg(short = 'm', long = "mapdir", help = "Directory containing mapids files")] - mapdir: PathBuf, + #[arg(short = 'm', long = "mapdir", help = "Directory containing mapids files", + requires_if("false", "no_reassembly"))] + mapdir: Option, /// Average Nucleotide Identity cutoff #[arg(short = 'i', long = "ani", default_value_t = 99.0, help = "ANI for clustering bins (%)")] @@ -60,6 +67,11 @@ struct Cli { /// Number of threads to use #[arg(short = 't', long = "threads", default_value_t = 8, help = "Number of threads to use")] threads: usize, + + /// Disable reassembly step + #[arg(long = "no-reassembly", help = "Perform dereplication without bin merging and reassembly", + conflicts_with_all = ["readdir", "mapdir"])] + no_reassembly: bool, /// First split bins before merging (if provided, set to true) #[arg(long = "split", help = "Split clusters into sample-wise bins before processing")] @@ -70,7 +82,7 @@ struct Cli { qual: Option, /// Assembler choice - #[arg(long = "assembler", default_value = "spades", help = "assembler choice for reassembly step (spades|megahit), spades is recommended")] + #[arg(long = "assembler", default_value = "spades", help = "Assembler choice for reassembly step (spades|megahit), spades is recommended")] assembler: String, } @@ -90,6 +102,7 @@ fn main() -> io::Result<()> { let split = cli.split; let assembler: String = cli.assembler; let qual = cli.qual; + let no_reassembly = cli.no_reassembly; let parentdir = bindir.parent().map(PathBuf::from).unwrap_or_else(|| bindir.clone()); info!("Starting MAGmax with parameters:"); @@ -97,27 +110,23 @@ fn main() -> io::Result<()> { info!(" 🔹 ANI Cutoff: {:.1}%", cli.ani); info!(" 🔹 Completeness Cutoff: {:.1}%", cli.completeness_cutoff); info!(" 🔹 Purity/Contamination: {:.1}%/{:.1}%", cli.purity_cutoff, contamination_cutoff); - info!(" 🔹 Map Directory: {:?}", mapdir); - info!(" 🔹 Read Directory: {:?}", readdir); info!(" 🔹 File Format: {}", format); info!(" 🔹 Threads: {}", threads); - info!(" 🔹 Assembler: {}", assembler); - if !["spades", "megahit"].contains(&assembler.as_str()) { - error!("Error: Invalid assembler choice '{}'. Allowed options: 'spades' or 'megahit'.", assembler); - exit(1); + if !no_reassembly { + info!(" 🔹 Map Directory: {:?}", mapdir); + info!(" 🔹 Read Directory: {:?}", readdir); + info!(" 🔹 Assembler: {}", assembler); + if !["spades", "megahit"].contains(&assembler.as_str()) { + error!("Error: Invalid assembler choice '{}'. Allowed options: 'spades' or 'megahit'.", assembler); + exit(1); + } } - let is_paired: bool = utility::check_paired_reads(&readdir); - if is_paired { - info!("Detected paired end \ - reads in separate files as \ - _1.fastq \ - and _2.fastq.") - } else { - info!("Detected single-end reads as .fastq.") + if no_reassembly{ + info!(" 🔸 MAGmax runs dereplication of input bins without bin merging and reassembly"); } - + let binfiles = utility::get_binfiles(&bindir,&format)?; if binfiles.is_empty() { @@ -271,10 +280,10 @@ fn main() -> io::Result<()> { &format, &bin_qualities, &merged_bin_quality, - is_paired, &assembler, completeness_cutoff, contamination_cutoff, + no_reassembly, id, ) .map_err(|e| { @@ -284,15 +293,18 @@ fn main() -> io::Result<()> { }) .expect("Error during processing components"); }); - - { - let merged_bin_qualities = merged_bin_qualities.lock().unwrap(); - for (key, value) in merged_bin_qualities.iter() { - bin_qualities.insert(key.to_string(), value.clone()); + if !no_reassembly { + { + let merged_bin_qualities = merged_bin_qualities.lock().unwrap(); + + for (key, value) in merged_bin_qualities.iter() { + bin_qualities.insert(key.to_string(), value.clone()); + } } + } - + // Final dereplication using skani let _ = merge::drep_finalbins(&resultdir, &bin_qualities, ani_cutoff); @@ -312,10 +324,10 @@ fn process_components( format: &String, bin_qualities: &HashMap, merged_bin_quality: &Arc>>, - is_paired: bool, assembler: &String, completeness_cutoff: f64, contamination_cutoff: f64, + no_reassembly: bool, id: usize, ) -> io::Result<()> { @@ -342,6 +354,34 @@ fn process_components( return Ok(()); } + // If no_reassembly is enabled, just select the best quality bin from each cluster + if no_reassembly { + let mut selected_bin: Option = None; + if let Some((bin_name, _, _)) = + reassemble::find_bestqualitybin(component, &bin_qualities, completeness_cutoff) + { + selected_bin = Some(bin_name); + } + + let _ = reassemble::select_bestqualitybin( + selected_bin, + bindir, + resultdir, + format + ); + + return Ok(()); + } + + let is_paired: bool = utility::check_paired_reads(&readdir); + if is_paired { + info!("Detected paired end \ + reads in separate files as \ + _1.fastq \ + and _2.fastq.") + } else { + info!("Detected single-end reads as .fastq.") + } // eg. selected_binset_path = /0_combined/ let selected_binset_path = resultdir.join(format!("{}_combined", id)); @@ -367,7 +407,6 @@ fn process_components( let all_enriched_scaffolds = utility::read_fasta( &selected_binset_path.join("combined.fasta").to_string_lossy() )?; - let scaffold_inputname:&str = "combined"; diff --git a/src/reassemble.rs b/src/reassemble.rs index d61d3be..ef5c3b4 100644 --- a/src/reassemble.rs +++ b/src/reassemble.rs @@ -53,27 +53,36 @@ pub fn run_reassembly( let mut selected_contamination: Option = None; // Find the best bin based on quality score within the cluster - if let Some((bin_name, completeness, contamination)) = component - .iter() - .filter_map(|bin| { - bin_qualities.get(bin).map(|quality| (bin, quality.completeness, quality.contamination)) - }) - .filter(|(_, completeness, _)| *completeness >= completeness_cutoff) - .max_by(|(_, completeness1, contamination1), (_, completeness2, contamination2)| { + // if let Some((bin_name, completeness, contamination)) = component + // .iter() + // .filter_map(|bin| { + // bin_qualities.get(bin).map(|quality| (bin, quality.completeness, quality.contamination)) + // }) + // .filter(|(_, completeness, _)| *completeness >= completeness_cutoff) + // .max_by(|(_, completeness1, contamination1), (_, completeness2, contamination2)| { - let score1 = completeness1 - (5.0 * contamination1); - let score2 = completeness2 - (5.0 * contamination2); + // let score1 = completeness1 - (5.0 * contamination1); + // let score2 = completeness2 - (5.0 * contamination2); - score1 - .partial_cmp(&score2) - .unwrap_or(std::cmp::Ordering::Equal) - .then_with(|| contamination1.partial_cmp(contamination2).unwrap_or(std::cmp::Ordering::Equal).reverse()) - }) + // score1 + // .partial_cmp(&score2) + // .unwrap_or(std::cmp::Ordering::Equal) + // .then_with(|| contamination1.partial_cmp(contamination2).unwrap_or(std::cmp::Ordering::Equal).reverse()) + // }) + // { + // selected_bin = Some(bin_name.to_string()); + // selected_completeness = Some(completeness); + // selected_contamination = Some(contamination); + // } + + if let Some((bin_name, completeness, contamination)) = + find_bestqualitybin(component, &bin_qualities, completeness_cutoff) { - selected_bin = Some(bin_name.to_string()); + selected_bin = Some(bin_name); selected_completeness = Some(completeness); selected_contamination = Some(contamination); } + let selected_quality_score = selected_completeness .zip(selected_contamination) .map(|(completeness, contamination)| completeness - (5.0 * contamination)) @@ -259,8 +268,36 @@ fn filterscaffold(input_file: &PathBuf) -> io::Result<()> { Ok(()) } +// Find the best bin from a cluster based on the quality score +pub fn find_bestqualitybin( + component: &HashSet, + bin_qualities: &HashMap, + completeness_cutoff: f64, +) -> Option<(String, f64, f64)> { + component + .iter() + .filter_map(|bin| { + bin_qualities.get(bin).map(|quality| { + (bin.clone(), quality.completeness, quality.contamination) + }) + }) + .filter(|(_, completeness, _)| *completeness >= completeness_cutoff) + .max_by(|(_, completeness1, contamination1), (_, completeness2, contamination2)| { + let score1 = completeness1 - (5.0 * contamination1); + let score2 = completeness2 - (5.0 * contamination2); + + score1 + .partial_cmp(&score2) + .unwrap_or(std::cmp::Ordering::Equal) + .then_with(|| contamination1 + .partial_cmp(contamination2) + .unwrap_or(std::cmp::Ordering::Equal) + .reverse()) + }) +} + // Select the best bin among the cluster members and reassembled bin -fn select_bestqualitybin( +pub fn select_bestqualitybin( selected_bin: Option, bindir: &PathBuf, outputpath: &PathBuf,