diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml index 511b3b1d3..0a0579764 100644 --- a/.github/workflows/linting.yml +++ b/.github/workflows/linting.yml @@ -28,7 +28,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install nf-core==2.2 + pip install nf-core==2.14.1 - name: Run nf-core lint env: diff --git a/.markdownlint.yml b/.markdownlint.yml deleted file mode 100644 index 9e605fcfa..000000000 --- a/.markdownlint.yml +++ /dev/null @@ -1,14 +0,0 @@ -# Markdownlint configuration file -default: true -line-length: false -ul-indent: - indent: 4 -no-duplicate-header: - siblings_only: true -no-inline-html: - allowed_elements: - - img - - p - - kbd - - details - - summary diff --git a/.nf-core.yml b/.nf-core.yml index 3c4b505cb..a1298cc75 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -14,7 +14,6 @@ lint: - docs/images/nf-core-autometa_logo_dark.png - .github/ISSUE_TEMPLATE/bug_report.md - .github/ISSUE_TEMPLATE/feature_request.md - files_unchanged: - manifest - .github/CONTRIBUTING.md @@ -30,10 +29,11 @@ lint: - LICENSE - .github/PULL_REQUEST_TEMPLATE.md - lib/NfcoreTemplate.groovy - actions_ci: - .github/workflows/ci.yml - schema_lint: true template_strings: false - nextflow_config: false + nextflow_config: + - manifest.name + - manifest.homePage +repository_type: pipeline diff --git a/.prettierignore b/.prettierignore new file mode 100644 index 000000000..d0e7ae589 --- /dev/null +++ b/.prettierignore @@ -0,0 +1,9 @@ +email_template.html +.nextflow* +work/ +data/ +results/ +.DS_Store +testing/ +testing* +*.pyc diff --git a/.prettierrc.yml b/.prettierrc.yml new file mode 100644 index 000000000..c81f9a766 --- /dev/null +++ b/.prettierrc.yml @@ -0,0 +1 @@ +printWidth: 120 diff --git a/Dockerfile b/Dockerfile index edc2f042f..192632e9d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -20,7 +20,7 @@ LABEL maintainer="jason.kwan@wisc.edu" # along with Autometa. If not, see . RUN apt-get update --allow-releaseinfo-change \ - && apt-get install -y procps make \ + && apt-get install -y procps make curl \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* @@ -28,6 +28,8 @@ COPY autometa-env.yml ./ RUN mamba env update -n base --file=autometa-env.yml \ && mamba clean --all -y +RUN mamba env update -n base --file=autometa-env.yml \ + && mamba clean --all -y COPY . 
/Autometa WORKDIR /Autometa @@ -42,6 +44,11 @@ RUN hmmpress -f autometa/databases/markers/bacteria.single_copy.hmm \ && autometa-config --section databases --option base --value ${DB_DIR} \ && echo "databases base directory set in ${DB_DIR}/" + +# make the /scratch/dbs directory available to anyone +RUN chmod -R 755 /scratch/dbs + + RUN echo "Testing autometa import" \ && python -c "import autometa" @@ -67,3 +74,5 @@ RUN echo "Checking autometa entrypoints" \ && autometa-binning-ldm-loginfo -h > /dev/null \ && autometa-benchmark -h > /dev/null \ && autometa-download-dataset -h > /dev/null + +ENV NUMBA_CACHE_DIR=/tmp diff --git a/autometa-env.yml b/autometa-env.yml index 6cc97229f..a63ea1d68 100644 --- a/autometa-env.yml +++ b/autometa-env.yml @@ -4,18 +4,23 @@ channels: - bioconda - defaults dependencies: + - aria2 - attrs # test-data requirement - bedtools - biopython>=1.82 - bowtie2 + - curl - diamond>=2.0 + - gzip - gdown - hmmer + - joblib>=1.1.0 # See https://stackoverflow.com/a/73830525/12671809 - numba>=0.47 - numpy>=1.13 - pandas>=1.5 - parallel - pip + - procps-ng # required by nextflow - prodigal # NOTE: 2.5 and 2.6 output format is different for sequence headers - python-annoy>=1.11 # required for trimap installation. - requests diff --git a/autometa/common/kmers.py b/autometa/common/kmers.py index ae7007f84..9fd37f56f 100644 --- a/autometa/common/kmers.py +++ b/autometa/common/kmers.py @@ -586,9 +586,12 @@ def embed( f"{method} not in embedding methods. Choices: {', '.join(choices)}" ) # PCA - n_samples, n_components = df.shape + # Drop any rows that all cols contain NaN. This may occur if the contig length is below the k-mer size X = df.dropna(axis="index", how="all").fillna(0).to_numpy() + n_samples, n_components = df.shape + + logger.warning(f"n_samples: {n_samples} n_components: {n_components}") # Set random state using provided seed random_state = np.random.RandomState(seed) if isinstance(pca_dimensions, str): @@ -599,11 +602,15 @@ def embed( f"pca_dimensions must be an integer! given: {pca_dimensions}" ) if n_components > pca_dimensions and pca_dimensions != 0: + if n_samples < pca_dimensions: + logging.error( + f"n_samples ({n_samples}) is less than pca_dimensions ({pca_dimensions}), lowering pca_dimensions to {min(n_samples, pca_dimensions)} ." + ) + pca_dimensions = min(n_samples, pca_dimensions) logger.debug( f"Performing decomposition with PCA (seed {seed}): {n_components} to {pca_dimensions} dims" ) X = PCA(n_components=pca_dimensions, random_state=random_state).fit_transform(X) - # X = PCA(n_components='mle').fit_transform(X) n_samples, n_components = X.shape logger.debug(f"{method}: {n_samples} data points and {n_components} dimensions") diff --git a/autometa/config/databases.py b/autometa/config/databases.py index 14ea42c3e..eb45c8779 100644 --- a/autometa/config/databases.py +++ b/autometa/config/databases.py @@ -1,4 +1,3 @@ -#!/usr/bin/env python """ # License: GNU Affero General Public License v3 or later # A copy of GNU AGPL v3 should have been included in this software package in LICENSE.txt. 
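The `kmers.py` hunk above adds two related safeguards to the embedding step: rows that are entirely NaN (contigs shorter than the k-mer size) are dropped before decomposition, and the requested `pca_dimensions` is lowered when there are fewer samples than requested components, since PCA cannot return more components than samples. The sketch below is a minimal standalone illustration of that guard, not Autometa's `embed()` itself; the helper name `reduce_dimensions` and the toy data are assumptions made for the example, while pandas, numpy, and `sklearn.decomposition.PCA` are the same libraries the patched code uses.

.. code-block:: python

    # Minimal sketch of the PCA-dimension guard added to autometa/common/kmers.py.
    # Illustrative only: the helper name and toy data are not part of Autometa.
    import logging

    import numpy as np
    import pandas as pd
    from sklearn.decomposition import PCA

    logger = logging.getLogger(__name__)


    def reduce_dimensions(df: pd.DataFrame, pca_dimensions: int, seed: int = 42) -> np.ndarray:
        # Drop rows where every column is NaN (e.g. contigs shorter than the k-mer size),
        # then fill any remaining NaNs with 0 before decomposition.
        X = df.dropna(axis="index", how="all").fillna(0).to_numpy()
        n_samples, n_components = X.shape
        if pca_dimensions and n_components > pca_dimensions:
            if n_samples < pca_dimensions:
                # PCA cannot return more components than samples, so clamp the target.
                logger.warning(
                    f"n_samples ({n_samples}) < pca_dimensions ({pca_dimensions}); "
                    f"lowering pca_dimensions to {n_samples}"
                )
                pca_dimensions = n_samples
            random_state = np.random.RandomState(seed)
            X = PCA(n_components=pca_dimensions, random_state=random_state).fit_transform(X)
        return X


    if __name__ == "__main__":
        # Three contigs but ten requested dimensions: the guard lowers the target to 3.
        toy = pd.DataFrame(np.random.rand(3, 136))
        print(reduce_dimensions(toy, pca_dimensions=10).shape)  # (3, 3)

Without the clamp, small test datasets (such as the minimal test data introduced elsewhere in this PR) would make `PCA(n_components=pca_dimensions)` raise, because scikit-learn requires `n_components <= min(n_samples, n_features)`.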
diff --git a/autometa/taxonomy/download_gtdb_files.py b/autometa/taxonomy/download_gtdb_files.py index ceabc7225..b914e1967 100644 --- a/autometa/taxonomy/download_gtdb_files.py +++ b/autometa/taxonomy/download_gtdb_files.py @@ -9,6 +9,8 @@ from tqdm import tqdm +from autometa.config.utilities import DEFAULT_FPATH + # Set up logger logger = logging.getLogger(__name__) @@ -312,3 +314,39 @@ def download_and_format(gtdb_host, gtdb_version, single_dir, force=False): "aa_reps_path": aa_reps_path, "combined_gtdb_fasta": combined_gtdb_fasta, } + + + +def main(): + import argparse + import logging as logger + + logger.basicConfig( + format="[%(asctime)s %(levelname)s] %(name)s: %(message)s", + datefmt="%m/%d/%Y %I:%M:%S %p", + level=logger.DEBUG, + ) + parser = argparse.ArgumentParser( + description="Download GTDB files", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "--version", + help="GTDB version to download, 'latest' to get the latest version, otherwise specify a version number.", + default="220", + ) + parser.add_argument( + "--host", + help="GTDB host to download files from.", + default="data.gtdb.ecogenomic.org", + ) + parser.add_argument( + "--outdir", + help="Directory to save the downloaded files.", + required=True + ) + args = parser.parse_args() + download_and_format(gtdb_host=args.host, gtdb_version=args.version, single_dir=args.outdir) + +if __name__ == "__main__": + main() diff --git a/autometa/taxonomy/gtdb.py b/autometa/taxonomy/gtdb.py index 6af8fd92e..24ee04e55 100644 --- a/autometa/taxonomy/gtdb.py +++ b/autometa/taxonomy/gtdb.py @@ -63,7 +63,7 @@ def __init__(self, dbdir: str, verbose: bool = True, config=DEFAULT_CONFIG): self.names_fpath = os.path.join(dbdir, "names.dmp") self.merged_fpath = os.path.join(dbdir, "merged.dmp") self.delnodes_fpath = os.path.join(dbdir, "delnodes.dmp") - self.verify_databases() + # self.verify_databases() self.names = self.parse_names() self.nodes = self.parse_nodes() self.merged = self.parse_merged() diff --git a/autometa/validation/datasets.py b/autometa/validation/datasets.py index 3adc8f4bd..0261bd74d 100755 --- a/autometa/validation/datasets.py +++ b/autometa/validation/datasets.py @@ -63,7 +63,10 @@ def download( file_id = df.loc[(community_size, file_name), "file_id"] file_id_filepath = os.path.join(community_size_outdir, file_name) url = f"https://drive.google.com/uc?id={file_id}" - + # if the file already exists, skip downloading + if os.path.exists(file_id_filepath): + logger.info(f"File {file_name} already exists in {community_size_outdir}. 
Skipping download.") + continue gdown.download(url, file_id_filepath) diff --git a/bin/mock_data_report.R b/bin/mock_data_report.R new file mode 100755 index 000000000..a363f1c96 --- /dev/null +++ b/bin/mock_data_report.R @@ -0,0 +1,28 @@ +#!/usr/bin/env Rscript + +args = commandArgs(trailingOnly=TRUE) + +rmarkdown::render( + input=args[[1]], + params=list( + bins_path=args[[2]], + assembly_to_locus_path=args[[2]], + assembly_report_path=args[[3]], + genus=FALSE + ), + knit_root_dir=getwd(), + output_dir=getwd(), + output_file="mock_data_report_by_assembly.html" +) +rmarkdown::render( + input=args[[1]], + params=list( + bins_path= args[[2]], + assembly_to_locus_path = args[[2]], + assembly_report_path = args[[3]], + genus=TRUE + ), + knit_root_dir=getwd(), + output_dir=getwd(), + output_file="mock_data_report_by_genus.html" +) diff --git a/conf/base.config b/conf/base.config index 88616e3af..6601fcdd5 100644 --- a/conf/base.config +++ b/conf/base.config @@ -26,22 +26,22 @@ process { // adding in your local modules too. // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors withLabel:process_low { - cpus = { check_max( 1 * task.attempt, 'cpus' ) } - memory = { check_max( 2.GB * task.attempt, 'memory' ) } + cpus = { check_max( 1 ) } + memory = { check_max( 6.GB * task.attempt, 'memory' ) } time = { check_max( 4.h * task.attempt, 'time' ) } } withLabel:process_medium { - cpus = { check_max( 8 * task.attempt, 'cpus' ) } - memory = { check_max( 8.GB * task.attempt, 'memory' ) } + cpus = { check_max( 6 * task.attempt, 'cpus' ) } + memory = { check_max( 36.GB * task.attempt, 'memory' ) } time = { check_max( 8.h * task.attempt, 'time' ) } } withLabel:process_high { - cpus = { check_max( 16 * task.attempt, 'cpus' ) } - memory = { check_max( 16.GB * task.attempt, 'memory' ) } - time = { check_max( 16.h * task.attempt, 'time' ) } + cpus = { check_max( 12 * task.attempt, 'cpus' ) } + memory = { check_max( 72.GB * task.attempt, 'memory' ) } + time = { check_max( 24.h * task.attempt, 'time' ) } } withLabel:process_long { - time = { check_max( 20.h * task.attempt, 'time' ) } + time = { check_max( 48.h * task.attempt, 'time' ) } } withLabel:process_high_memory { memory = { check_max( 200.GB * task.attempt, 'memory' ) } @@ -53,4 +53,7 @@ process { errorStrategy = 'retry' maxRetries = 2 } + withName:CUSTOM_DUMPSOFTWAREVERSIONS { + cache = false + } } diff --git a/conf/modules.config b/conf/modules.config index 239c6b059..6a911643e 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -1,128 +1,183 @@ /* ======================================================================================== - Config file for defining DSL2 per module options + Config file for defining DSL2 per module options and publishing paths ======================================================================================== Available keys to override module options: - args = Additional arguments appended to command in module. - args2 = Second set of arguments appended to command in module (multi-tool modules). - args3 = Third set of arguments appended to command in module (multi-tool modules). - publish_dir = Directory to publish results. 
- publish_by_meta = Groovy list of keys available in meta map to append as directories to "publish_dir" path - If publish_by_meta = true - Value of ${meta['id']} is appended as a directory to "publish_dir" path - If publish_by_meta = ['id', 'custompath'] - If "id" is in meta map and "custompath" isn't then "${meta['id']}/custompath/" - is appended as a directory to "publish_dir" path - If publish_by_meta = false / null - No directories are appended to "publish_dir" path - publish_files = Groovy map where key = "file_ext" and value = "directory" to publish results for that file extension - The value of "directory" is appended to the standard "publish_dir" path as defined above. - If publish_files = null (unspecified) - All files are published. - If publish_files = false - No files are published. - suffix = File name suffix for output files. + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. ---------------------------------------------------------------------------------------- */ -params { - modules { - 'count_kmers_options' { - publish_by_meta = ['id'] - publish_dir = "count_kmer_analysis" - } - 'normalize_kmers_options' { - publish_by_meta = ['id'] - publish_dir = "normalize_kmer_analysis" - } - 'embed_kmers_options' { - publish_by_meta = ['id'] - publish_dir = "embed_kmer_analysis" - } - 'diamond_blastp_options' { - args = "--evalue 1e-5 --max-target-seqs 200 -b 6 --outfmt 6" - publish_by_meta = ['id'] - publish_dir = "diamond_blastp_results" - } - 'get_genomes_for_mock' { - args = "https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_refseq.txt" - args2 = 'GCF_000734955.1|GCF_900448115.1|GCF_015751765.1' - publish_files = false - } - 'hmmsearch_options' { - args = "-Z 150 --cpu 1 --seed 42" - args2 = "" - } - 'hmmsearch_filter_options' { - args = "" - } - 'merge_hmmsearch_options'{ - publish_by_meta = ['id'] - publish_dir = "hmmsearch" - } - 'majority_vote_options' { - publish_by_meta = ['id'] - } - 'merge_kmers_embedded_options'{ - publish_by_meta = ['id'] - publish_dir = "kmers_embedded" - } - 'merge_kmers_normalized_options'{ - publish_by_meta = ['id'] - publish_dir = "kmers_normalized" - } - 'mock_data_report'{ - publish_by_meta = ['id'] - publish_dir = "mock_data_reports" - } - 'prodigal_options' { - publish_by_meta = ['id'] - args = "-p meta -m" - publish_dir = "prodigal" - } - 'diamond_makedb_options' { - publish_by_meta = ['id'] - args = "" - } - 'align_reads_options' { - args = "" - args2 = "-q --phred33 --very-sensitive --no-unal" - publish_by_meta = ['id'] - publish_dir = "align_reads" - } - 'samtools_viewsort_options' { - args = "" - args2 = "" - publish_by_meta = ['id'] - publish_dir = "samtools_sort" - } - 'bedtools_genomecov_options' { - args = "" - args2 = "" - publish_by_meta = ['id'] - publish_dir = "genome_coverage" - } - 'seqkit_split_options' { + + +process { + withName:'ALIGN_READS'{ + publishDir = [ + path: { "${params.outdir}/${meta.id}/${task.process.tokenize(':')[-1].toLowerCase()}" }, + mode: params.publish_dir_mode + ] + ext.args = '' + ext.args2 = '-q --phred33 --very-sensitive --no-unal' + } + withName:'BEDTOOLS_GENOMECOV'{ + publishDir = [ + path: { "${params.outdir}/${meta.id}/${task.process.tokenize(':')[-1].toLowerCase()}" }, + mode: params.publish_dir_mode + ] + } + withName:'BINNING'{ + 
publishDir = [ + path: { "${params.outdir}/${meta.id}/${task.process.tokenize(':')[-1].toLowerCase()}" }, + mode: params.publish_dir_mode + ] + } + withName:'BINNING_SUMMARY'{ + publishDir = [ + path: { "${params.outdir}/${meta.id}/${task.process.tokenize(':')[-1].toLowerCase()}" }, + mode: params.publish_dir_mode + ] + } + withName:'COUNT_KMERS'{ + publishDir = [ + path: { "${params.outdir}/${meta.id}/${task.process.tokenize(':')[-1].toLowerCase()}/${meta.taxon}" }, + mode: params.publish_dir_mode + ] + } + withName: CUSTOM_DUMPSOFTWAREVERSIONS { + publishDir = [ + path: { "${params.tracedir}" }, + mode: 'copy', + pattern: '*_versions.yml' + ] + } + withName:'DIAMOND_BLASTP'{ + publishDir = [ + path: { "${params.outdir}/${meta.id}/${task.process.tokenize(':')[-1].toLowerCase()}" }, + mode: params.publish_dir_mode + ] + ext.args = '--faster --evalue 1e-5 --max-hsps 1 --max-target-seqs 200 -b 6 --outfmt 6' + } + withName:'DIAMOND_MAKEDB'{ + storeDir = {"${params.nr_dmnd_dir}"} + } + withName:'DOWNLOAD_ACESSION2TAXID'{ + storeDir = {"${params.prot_accession2taxid_gz_dir}"} + } + withName:'DOWNLOAD_NR'{ + storeDir = {"${params.nr_dmnd_dir}"} + } + withName:'DOWNLOAD_TAXDUMP'{ + storeDir = {"${params.taxdump_tar_gz_dir}"} + } + withName:'EMBED_KMERS'{ + publishDir = [ + path: { "${params.outdir}/${meta.id}/${task.process.tokenize(':')[-1].toLowerCase()}/${meta.taxon}" }, + mode: params.publish_dir_mode + ] + } + withName:'GET_GENOMES_FOR_MOCK'{ + storeDir = { "${params.outdir}/${meta.id}/mock_data/genomes"} + ext.args = 'https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/assembly_summary_refseq.txt' + ext.args2 = 'GCF_000734955.1|GCF_900448115.1|GCF_015751765.1' + } + withName:'GTDB_MAKEDB'{ + storeDir = {"${params.gtdb_dir}"} + } + withName:'MAJORITY_VOTE'{ + publishDir = [ + path: { "${params.outdir}/${meta.id}/${task.process.tokenize(':')[-1].toLowerCase()}" }, + mode: params.publish_dir_mode + ] + } + withName:'MARKERS'{ + publishDir = [ + path: { "${params.outdir}/${meta.id}/${task.process.tokenize(':')[-1].toLowerCase()}" }, + mode: params.publish_dir_mode + ] + } + withName:'MOCK_DATA_REPORT'{ + publishDir = [ + path: { "${params.outdir}/${meta.id}/${task.process.tokenize(':')[-1].toLowerCase()}" }, + mode: params.publish_dir_mode + ] + } + withName:'NORMALIZE_KMERS'{ + publishDir = [ + path: { "${params.outdir}/${meta.id}/${task.process.tokenize(':')[-1].toLowerCase()}/${meta.taxon}" }, + mode: params.publish_dir_mode + ] + } + withName:'PARSE_BED'{ + publishDir = [ + path: { "${params.outdir}/${meta.id}/${task.process.tokenize(':')[-1].toLowerCase()}" }, + mode: params.publish_dir_mode + ] + } + withName:'PRODIGAL'{ + publishDir = [ + path: { "${params.outdir}/${meta.id}/${task.process.tokenize(':')[-1].toLowerCase()}" }, + mode: params.publish_dir_mode + ] + ext.args = '-p meta -m' + } + withName:'REDUCE_LCA'{ + publishDir = [ + path: { "${params.outdir}/${meta.id}/${task.process.tokenize(':')[-1].toLowerCase()}" }, + mode: params.publish_dir_mode + ] + } + withName:'SAMPLESHEET_CHECK'{ + publishDir = [ + path: { "${params.outdir}/${task.process.tokenize(':')[-1].toLowerCase()}" }, + mode: params.publish_dir_mode + ] + } + withName:'SAMTOOLS_VIEW_AND_SORT'{ + publishDir = [ + path: { "${params.outdir}/${meta.id}/${task.process.tokenize(':')[-1].toLowerCase()}" }, + mode: params.publish_dir_mode + ] + ext.args = '' + ext.args2 = '' + } + withName:'SEQKIT_FILTER'{ + publishDir = [ + path: { "${params.outdir}/${meta.id}/${task.process.tokenize(':')[-1].toLowerCase()}" }, + mode: 
params.publish_dir_mode + ] + } + withName:'SPADES_KMER_COVERAGE'{ + publishDir = [ + path: { "${params.outdir}/${meta.id}/${task.process.tokenize(':')[-1].toLowerCase()}" }, + mode: params.publish_dir_mode, + pattern: '*.coverages.tsv' + ] + } + withName:'SPLIT_KINGDOMS'{ + publishDir = [ + path: { "${params.outdir}/${meta.id}/${task.process.tokenize(':')[-1].toLowerCase()}" }, + mode: params.publish_dir_mode + ] + } + withName:'RECRUIT'{ + publishDir = [ + path: { "${params.outdir}/${meta.id}/${task.process.tokenize(':')[-1].toLowerCase()}" }, + mode: params.publish_dir_mode + ] + } + withName:'TEST_DOWNLOAD'{ + storeDir = {"${params.prot_accession2taxid_gz_dir}"} + } +} + +/* + +'seqkit_split_options' { publish_by_meta = ['id'] - args = "" args2 = "--two-pass" } - 'spades_kmer_coverage' { - publish_by_meta = ['id'] - publish_files = ['*.coverages.tsv':''] - publish_dir = "coverage" - } - 'split_kingdoms_options' { - publish_by_meta = ['id'] - } - 'taxon_assignment' { - publish_by_meta = ['id'] - } - 'binning_options' { - publish_by_meta = ['id'] - publish_dir = "binning" - } - 'unclustered_recruitment_options' { - publish_by_meta = ['id'] - publish_dir = "unclustered_recruitment" - } - 'binning_summary_options' { - publish_by_meta = ['id'] - publish_dir = "binning_summary" - } - } -} + + + +*/ diff --git a/conf/test.config b/conf/test.config index 571e13bab..06dfca8fc 100644 --- a/conf/test.config +++ b/conf/test.config @@ -7,8 +7,49 @@ ======================================================================================== Defines input files and everything required to run a fast and simple pipeline test. - Use as follows: - nextflow run autometa -profile test, +example_dir="/tmp/autometa_test" +mkdir -p $example_dir $example_dir/database_directory $example_dir/output +cd $example_dir + +curl -L -H "Accept: application/vnd.github.v3.raw" https://github.com/KwanLab/autometa_test_data/raw/refs/heads/main/minimal/combined_nucleotide.fna.gz -o $example_dir/combined_nucleotide.fna.gz +curl -L -H "Accept: application/vnd.github.v3.raw" https://github.com/KwanLab/autometa_test_data/raw/refs/heads/main/minimal/reads_1.fastq.gz -o $example_dir/reads_1.fastq.gz +curl -L -H "Accept: application/vnd.github.v3.raw" https://github.com/KwanLab/autometa_test_data/raw/refs/heads/main/minimal/reads_2.fastq.gz -o $example_dir/reads_2.fastq.gz +curl -L -H "Accept: application/vnd.github.v3.raw" https://github.com/KwanLab/autometa_test_data/raw/refs/heads/main/minimal/database_directory/prot.accession2taxid.gz -o $example_dir/database_directory/prot.accession2taxid.gz +curl -L -H "Accept: application/vnd.github.v3.raw" https://github.com/KwanLab/autometa_test_data/raw/refs/heads/main/minimal/database_directory/nr.dmnd -o $example_dir/database_directory/nr.dmnd + +# Create a sample sheet +sample_sheet="$example_dir/autometa_test_samplesheet.csv" +echo "sample,assembly,fastq_1,fastq_2,coverage_tab,cov_from_assembly" > $sample_sheet +echo "example_1,${example_dir}/combined_nucleotide.fna.gz,${example_dir}/reads_1.fastq.gz,${example_dir}/reads_2.fastq.gz,,0" >> $sample_sheet + +cd ~/Autometa + + nextflow run KwanLab/Autometa \ + -profile docker \ + --input $sample_sheet \ + --taxonomy_aware \ + --outdir ${example_dir}/output \ + --single_db_dir $example_dir/database_directory \ + --autometa_image_tag 'dev' \ + --use_gtdb \ + --gtdb_version '220' \ + --large_downloads_permission + -resume + +# or + + nextflow run KwanLab/Autometa \ + -profile docker \ + --input $sample_sheet \ + --taxonomy_aware \ + --outdir 
${example_dir}/output \ + --single_db_dir $example_dir/database_directory \ + --autometa_image_tag 'dev' \ + --large_downloads_permission \ + --max_memory '900.GB' \ + --max_cpus 90 \ + --max_time '20040.h' \ + -resume ---------------------------------------------------------------------------------------- */ @@ -22,10 +63,7 @@ params { max_memory = 6.GB max_time = 2.h - // Input data - // Specify the paths to your test data on nf-core/test-datasets - // Give any required params for the test so that command line flags are not needed - input = 'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/015/645/455/GCF_015645455.1_ASM1564545v1/GCF_015645455.1_ASM1564545v1_genomic.fna.gz' mock_test = true + debug = true } diff --git a/docker/modules/mock_data_reporter.Dockerfile b/docker/modules/mock_data_reporter.Dockerfile index 00e7b3b03..b72d322aa 100644 --- a/docker/modules/mock_data_reporter.Dockerfile +++ b/docker/modules/mock_data_reporter.Dockerfile @@ -1,4 +1,4 @@ -FROM rocker/rstudio:4.1.2 +FROM rocker/rstudio:4.2.2 # Not starting from r-base b/c pandoc, etc needed LABEL maintainer="jason.kwan@wisc.edu" @@ -17,9 +17,12 @@ RUN apt-get update -qq && apt-get -y --no-install-recommends install \ libnetcdf-dev \ udunits-bin \ libudunits2-dev \ - curl + curl \ + procps # R packages -ENV R_PACKAGES='c("ggbeeswarm","data.table","plotly","crosstalk","DT","patchwork")' -RUN echo 'options("repos"="https://mran.microsoft.com/snapshot/2022-01-19")' >> /usr/local/lib/R/etc/Rprofile.site +ENV R_PACKAGES='c("rmarkdown", "data.table", "ggplot2", "plotly", "crosstalk", "magrittr", "DT", "ggbeeswarm", "patchwork", "htmltools")' + +# MRAN is going away. TODO: find a suitable replacement or snaphshot with renv or just cross fingers +# RUN echo 'options("repos"="https://mran.microsoft.com/snapshot/2023-03-03")' >> /usr/local/lib/R/etc/Rprofile.site RUN Rscript -e "install.packages(${R_PACKAGES}, Ncpus=parallel::detectCores())" diff --git a/docs/source/nextflow-workflow.rst b/docs/source/nextflow-workflow.rst index ad5790e2f..7dd5f210b 100644 --- a/docs/source/nextflow-workflow.rst +++ b/docs/source/nextflow-workflow.rst @@ -5,6 +5,72 @@ ======================= +Ultra-Quick Start +############# + +If you already have Nextflow and Docker installed the following commands will get you started. For detailed instructions see the sections following this. + + +.. 
code-block:: bash + + # change this to your desired directories + example_dir="/tmp/autometa_test" + single_database_dir="/tmp/autometa_test/database_directory" + + # make the needed subdirectories + mkdir -p $example_dir $example_dir/output + cd $example_dir + + # download small example data + curl -L -H "Accept: application/vnd.github.v3.raw" https://github.com/KwanLab/autometa_test_data/raw/refs/heads/main/minimal/combined_nucleotide.fna.gz -o $example_dir/combined_nucleotide.fna.gz + curl -L -H "Accept: application/vnd.github.v3.raw" https://github.com/KwanLab/autometa_test_data/raw/refs/heads/main/minimal/reads_1.fastq.gz -o $example_dir/reads_1.fastq.gz + curl -L -H "Accept: application/vnd.github.v3.raw" https://github.com/KwanLab/autometa_test_data/raw/refs/heads/main/minimal/reads_2.fastq.gz -o $example_dir/reads_2.fastq.gz + curl -L -H "Accept: application/vnd.github.v3.raw" https://github.com/KwanLab/autometa_test_data/raw/refs/heads/main/minimal/database_directory/prot.accession2taxid.gz -o $example_dir/database_directory/prot.accession2taxid.gz + curl -L -H "Accept: application/vnd.github.v3.raw" https://github.com/KwanLab/autometa_test_data/raw/refs/heads/main/minimal/database_directory/nr.dmnd -o $example_dir/database_directory/nr.dmnd + + # Create a sample sheet + sample_sheet="$example_dir/autometa_test_samplesheet.csv" + echo "sample,assembly,fastq_1,fastq_2,coverage_tab,cov_from_assembly" > $sample_sheet + echo "example_1,${example_dir}/combined_nucleotide.fna.gz,${example_dir}/reads_1.fastq.gz,${example_dir}/reads_2.fastq.gz,,0" >> $sample_sheet + + # Run Autometa without taxon splitting + nextflow run KwanLab/Autometa \ + -profile docker \ + --input $sample_sheet \ + --outdir ${example_dir}/output \ + --max_memory '16.GB' \ + --max_cpus 9 \ + --max_time '8.h' + + # Or use NCBI nr to split contigs by taxonomy + nextflow run KwanLab/Autometa \ + -profile docker \ + --input $sample_sheet \ + --taxonomy_aware \ + --outdir ${example_dir}/output \ + --single_db_dir ${single_database_dir} \ + --autometa_image_tag 'dev' \ + --large_downloads_permission \ + --max_memory '16.GB' \ + --max_cpus 9 \ + --max_time '8.h' + + # Or with GTDB refinement + nextflow run KwanLab/Autometa \ + -profile docker \ + --input $sample_sheet \ + --taxonomy_aware \ + --outdir ${example_dir}/output_gtdb \ + --single_db_dir ${single_database_dir} \ + --autometa_image_tag 'dev' \ + --use_gtdb \ + --gtdb_version '220' \ + --gtdb_dir ${single_database_dir} \ + --large_downloads_permission \ + --max_memory '16.GB' \ + --max_cpus 9 \ + --max_time '8.h' + Why nextflow? ############# @@ -227,12 +293,6 @@ Then copy the following code block into that new file ("agrp" is the slurm parti slurm { process.executor = "slurm" process.queue = "agrp" // <<-- change this to whatever your partition is called - docker.enabled = true - docker.userEmulation = true - singularity.enabled = false - podman.enabled = false - shifter.enabled = false - charliecloud.enabled = false executor { queueSize = 8 } @@ -609,7 +669,7 @@ may still use multiple cores. Databases ********* -Autometa uses the following NCBI databases throughout its pipeline: +When the Autometa workflow is run with the `--taxonomy_aware` flag it will use NCBI nr databases to help bin contigs. 
If the databases aren't present and you include the `--large_downloads_permission` flag, the workflow will download and format the following databases: - Non-redundant nr database - `ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz `_ @@ -618,18 +678,27 @@ Autometa uses the following NCBI databases throughout its pipeline: - nodes.dmp, names.dmp and merged.dmp - Found within - `ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdump.tar.gz `_ -If you are running autometa for the first time you'll have to download these databases. -You may use ``autometa-update-databases --update-ncbi``. This will download the databases to the default path. You can check -the default paths using ``autometa-config --print``. If you need to change the default download directory you can use -``autometa-config --section databases --option ncbi --value ``. -See ``autometa-update-databases -h`` and ``autometa-config -h`` for full list of options. +Additionally, the NCBI-taxonomy based taxa assignments can be refined using GTDB. To do so you must use the flag `--use_gtdb` and, optionally, the version of GTDB you would like to use with the `--gtdb_version` flag. + +If the `--large_downloads_permission` is provided the workflow will handle the downloading and formatting of the following files; and you should let it because it isn't straightforward to do manually. + +- GTDB taxdump + - `https://github.com/shenwei356/gtdb-taxdump/releases `_ +- GTDB database + - e.g. `https://data.gtdb.ecogenomic.org/releases/release220/220.0/genomic_files_reps/gtdb_proteins_aa_reps_r220.tar.gz `_ In your ``nf-params.json`` file you also need to specify the directory where the different databases are present. -Make sure that the directory path contains the following databases: -- Diamond formatted nr file => nr.dmnd -- Extracted files from tarball taxdump.tar.gz -- prot.accession2taxid.gz +The easiest method is to just set `--single_db_dir` to the directory where all the databases will stored and let the workflow handle the rest. +If you want finer control you can direct the workflow to specific database directories using the following parameters: + +- `--nr_dmnd_dir` +- `--lca_dir` +- `--prot_accession2taxid_gz_dir` +- `--taxdump_tar_gz_dir` +- `--gtdb_dir` + + .. 
code-block:: diff --git a/lib/NfcoreSchema.groovy b/lib/NfcoreSchema.groovy index 40ab65f20..b3d092f80 100755 --- a/lib/NfcoreSchema.groovy +++ b/lib/NfcoreSchema.groovy @@ -27,7 +27,7 @@ class NfcoreSchema { /* groovylint-disable-next-line UnusedPrivateMethodParameter */ public static void validateParameters(workflow, params, log, schema_filename='nextflow_schema.json') { def has_error = false - //=====================================================================// + //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// // Check for nextflow core params and unexpected params def json = new File(getSchemaPath(workflow, schema_filename=schema_filename)).text def Map schemaParams = (Map) new JsonSlurper().parseText(json).get('definitions') @@ -135,7 +135,7 @@ class NfcoreSchema { } } - //=====================================================================// + //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~// // Validate parameters against the schema InputStream input_stream = new File(getSchemaPath(workflow, schema_filename=schema_filename)).newInputStream() JSONObject raw_schema = new JSONObject(new JSONTokener(input_stream)) diff --git a/lib/NfcoreTemplate.groovy b/lib/NfcoreTemplate.groovy deleted file mode 100755 index 44551e0a3..000000000 --- a/lib/NfcoreTemplate.groovy +++ /dev/null @@ -1,270 +0,0 @@ -// -// This file holds several functions used within the nf-core pipeline template. -// - -import org.yaml.snakeyaml.Yaml - -class NfcoreTemplate { - - // - // Check AWS Batch related parameters have been specified correctly - // - public static void awsBatch(workflow, params) { - if (workflow.profile.contains('awsbatch')) { - // Check params.awsqueue and params.awsregion have been set if running on AWSBatch - assert (params.awsqueue && params.awsregion) : "Specify correct --awsqueue and --awsregion parameters on AWSBatch!" - // Check outdir paths to be S3 buckets if running on AWSBatch - assert params.outdir.startsWith('s3:') : "Outdir not on S3 - specify S3 Bucket to run on AWSBatch!" - } - } - - // - // Check params.hostnames - // - public static void hostName(workflow, params, log) { - Map colors = logColours(params.monochrome_logs) - if (params.hostnames) { - try { - def hostname = "hostname".execute().text.trim() - params.hostnames.each { prof, hnames -> - hnames.each { hname -> - if (hostname.contains(hname) && !workflow.profile.contains(prof)) { - log.info "=${colors.yellow}====================================================${colors.reset}=\n" + - "${colors.yellow}WARN: You are running with `-profile $workflow.profile`\n" + - " but your machine hostname is ${colors.white}'$hostname'${colors.reset}.\n" + - " ${colors.yellow_bold}Please use `-profile $prof${colors.reset}`\n" + - "=${colors.yellow}====================================================${colors.reset}=" - } - } - } - } catch (Exception e) { - log.warn "[$workflow.manifest.name] Could not determine 'hostname' - skipping check. Reason: ${e.message}." 
- } - } - } - - // - // Construct and send completion email - // - public static void email(workflow, params, summary_params, projectDir, log, multiqc_report=[]) { - - // Set up the e-mail variables - def subject = "[$workflow.manifest.name] Successful: $workflow.runName" - if (!workflow.success) { - subject = "[$workflow.manifest.name] FAILED: $workflow.runName" - } - - def summary = [:] - for (group in summary_params.keySet()) { - summary << summary_params[group] - } - - def misc_fields = [:] - misc_fields['Date Started'] = workflow.start - misc_fields['Date Completed'] = workflow.complete - misc_fields['Pipeline script file path'] = workflow.scriptFile - misc_fields['Pipeline script hash ID'] = workflow.scriptId - if (workflow.repository) misc_fields['Pipeline repository Git URL'] = workflow.repository - if (workflow.commitId) misc_fields['Pipeline repository Git Commit'] = workflow.commitId - if (workflow.revision) misc_fields['Pipeline Git branch/tag'] = workflow.revision - misc_fields['Nextflow Version'] = workflow.nextflow.version - misc_fields['Nextflow Build'] = workflow.nextflow.build - misc_fields['Nextflow Compile Timestamp'] = workflow.nextflow.timestamp - - def email_fields = [:] - email_fields['version'] = workflow.manifest.version - email_fields['runName'] = workflow.runName - email_fields['success'] = workflow.success - email_fields['dateComplete'] = workflow.complete - email_fields['duration'] = workflow.duration - email_fields['exitStatus'] = workflow.exitStatus - email_fields['errorMessage'] = (workflow.errorMessage ?: 'None') - email_fields['errorReport'] = (workflow.errorReport ?: 'None') - email_fields['commandLine'] = workflow.commandLine - email_fields['projectDir'] = workflow.projectDir - email_fields['summary'] = summary << misc_fields - - // On success try attach the multiqc report - def mqc_report = null - try { - if (workflow.success) { - mqc_report = multiqc_report.getVal() - if (mqc_report.getClass() == ArrayList && mqc_report.size() >= 1) { - if (mqc_report.size() > 1) { - log.warn "[$workflow.manifest.name] Found multiple reports from process 'MULTIQC', will use only one" - } - mqc_report = mqc_report[0] - } - } - } catch (all) { - if (multiqc_report) { - log.warn "[$workflow.manifest.name] Could not attach MultiQC report to summary email" - } - } - - // Check if we are only sending emails on failure - def email_address = params.email - if (!params.email && params.email_on_fail && !workflow.success) { - email_address = params.email_on_fail - } - - // Render the TXT template - def engine = new groovy.text.GStringTemplateEngine() - def tf = new File("$projectDir/assets/email_template.txt") - def txt_template = engine.createTemplate(tf).make(email_fields) - def email_txt = txt_template.toString() - - // Render the HTML template - def hf = new File("$projectDir/assets/email_template.html") - def html_template = engine.createTemplate(hf).make(email_fields) - def email_html = html_template.toString() - - // Render the sendmail template - def max_multiqc_email_size = params.max_multiqc_email_size as nextflow.util.MemoryUnit - def smail_fields = [ email: email_address, subject: subject, email_txt: email_txt, email_html: email_html, projectDir: "$projectDir", mqcFile: mqc_report, mqcMaxSize: max_multiqc_email_size.toBytes() ] - def sf = new File("$projectDir/assets/sendmail_template.txt") - def sendmail_template = engine.createTemplate(sf).make(smail_fields) - def sendmail_html = sendmail_template.toString() - - // Send the HTML e-mail - Map colors = 
logColours(params.monochrome_logs) - if (email_address) { - try { - if (params.plaintext_email) { throw GroovyException('Send plaintext e-mail, not HTML') } - // Try to send HTML e-mail using sendmail - [ 'sendmail', '-t' ].execute() << sendmail_html - log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Sent summary e-mail to $email_address (sendmail)-" - } catch (all) { - // Catch failures and try with plaintext - def mail_cmd = [ 'mail', '-s', subject, '--content-type=text/html', email_address ] - if ( mqc_report.size() <= max_multiqc_email_size.toBytes() ) { - mail_cmd += [ '-A', mqc_report ] - } - mail_cmd.execute() << email_html - log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Sent summary e-mail to $email_address (mail)-" - } - } - - // Write summary e-mail HTML to a file - def output_d = new File("${params.outdir}/pipeline_info/") - if (!output_d.exists()) { - output_d.mkdirs() - } - def output_hf = new File(output_d, "pipeline_report.html") - output_hf.withWriter { w -> w << email_html } - def output_tf = new File(output_d, "pipeline_report.txt") - output_tf.withWriter { w -> w << email_txt } - } - - // - // Print pipeline summary on completion - // - public static void summary(workflow, params, log) { - Map colors = logColours(params.monochrome_logs) - if (workflow.success) { - if (workflow.stats.ignoredCount == 0) { - log.info "-${colors.purple}[$workflow.manifest.name]${colors.green} Pipeline completed successfully${colors.reset}-" - } else { - log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} Pipeline completed successfully, but with errored process(es) ${colors.reset}-" - } - } else { - hostName(workflow, params, log) - log.info "-${colors.purple}[$workflow.manifest.name]${colors.red} Pipeline completed with errors${colors.reset}-" - } - } - - // - // ANSII Colours used for terminal logging - // - public static Map logColours(Boolean monochrome_logs) { - Map colorcodes = [:] - - // Reset / Meta - colorcodes['reset'] = monochrome_logs ? '' : "\033[0m" - colorcodes['bold'] = monochrome_logs ? '' : "\033[1m" - colorcodes['dim'] = monochrome_logs ? '' : "\033[2m" - colorcodes['underlined'] = monochrome_logs ? '' : "\033[4m" - colorcodes['blink'] = monochrome_logs ? '' : "\033[5m" - colorcodes['reverse'] = monochrome_logs ? '' : "\033[7m" - colorcodes['hidden'] = monochrome_logs ? '' : "\033[8m" - - // Regular Colors - colorcodes['black'] = monochrome_logs ? '' : "\033[0;30m" - colorcodes['red'] = monochrome_logs ? '' : "\033[0;31m" - colorcodes['green'] = monochrome_logs ? '' : "\033[0;32m" - colorcodes['yellow'] = monochrome_logs ? '' : "\033[0;33m" - colorcodes['blue'] = monochrome_logs ? '' : "\033[0;34m" - colorcodes['purple'] = monochrome_logs ? '' : "\033[0;35m" - colorcodes['cyan'] = monochrome_logs ? '' : "\033[0;36m" - colorcodes['white'] = monochrome_logs ? '' : "\033[0;37m" - - // Bold - colorcodes['bblack'] = monochrome_logs ? '' : "\033[1;30m" - colorcodes['bred'] = monochrome_logs ? '' : "\033[1;31m" - colorcodes['bgreen'] = monochrome_logs ? '' : "\033[1;32m" - colorcodes['byellow'] = monochrome_logs ? '' : "\033[1;33m" - colorcodes['bblue'] = monochrome_logs ? '' : "\033[1;34m" - colorcodes['bpurple'] = monochrome_logs ? '' : "\033[1;35m" - colorcodes['bcyan'] = monochrome_logs ? '' : "\033[1;36m" - colorcodes['bwhite'] = monochrome_logs ? '' : "\033[1;37m" - - // Underline - colorcodes['ublack'] = monochrome_logs ? '' : "\033[4;30m" - colorcodes['ured'] = monochrome_logs ? 
'' : "\033[4;31m" - colorcodes['ugreen'] = monochrome_logs ? '' : "\033[4;32m" - colorcodes['uyellow'] = monochrome_logs ? '' : "\033[4;33m" - colorcodes['ublue'] = monochrome_logs ? '' : "\033[4;34m" - colorcodes['upurple'] = monochrome_logs ? '' : "\033[4;35m" - colorcodes['ucyan'] = monochrome_logs ? '' : "\033[4;36m" - colorcodes['uwhite'] = monochrome_logs ? '' : "\033[4;37m" - - // High Intensity - colorcodes['iblack'] = monochrome_logs ? '' : "\033[0;90m" - colorcodes['ired'] = monochrome_logs ? '' : "\033[0;91m" - colorcodes['igreen'] = monochrome_logs ? '' : "\033[0;92m" - colorcodes['iyellow'] = monochrome_logs ? '' : "\033[0;93m" - colorcodes['iblue'] = monochrome_logs ? '' : "\033[0;94m" - colorcodes['ipurple'] = monochrome_logs ? '' : "\033[0;95m" - colorcodes['icyan'] = monochrome_logs ? '' : "\033[0;96m" - colorcodes['iwhite'] = monochrome_logs ? '' : "\033[0;97m" - - // Bold High Intensity - colorcodes['biblack'] = monochrome_logs ? '' : "\033[1;90m" - colorcodes['bired'] = monochrome_logs ? '' : "\033[1;91m" - colorcodes['bigreen'] = monochrome_logs ? '' : "\033[1;92m" - colorcodes['biyellow'] = monochrome_logs ? '' : "\033[1;93m" - colorcodes['biblue'] = monochrome_logs ? '' : "\033[1;94m" - colorcodes['bipurple'] = monochrome_logs ? '' : "\033[1;95m" - colorcodes['bicyan'] = monochrome_logs ? '' : "\033[1;96m" - colorcodes['biwhite'] = monochrome_logs ? '' : "\033[1;97m" - - return colorcodes - } - - // - // Does what is says on the tin - // - public static String dashedLine(monochrome_logs) { - Map colors = logColours(monochrome_logs) - return "-${colors.dim}----------------------------------------------------${colors.reset}-" - } - - // - // nf-core logo - // - public static String logo(workflow, monochrome_logs) { - Map colors = logColours(monochrome_logs) - String.format( - """\n - ${dashedLine(monochrome_logs)} - ${colors.green},--.${colors.black}/${colors.green},-.${colors.reset} - ${colors.blue} ___ __ __ __ ___ ${colors.green}/,-._.--~\'${colors.reset} - ${colors.blue} |\\ | |__ __ / ` / \\ |__) |__ ${colors.yellow}} {${colors.reset} - ${colors.blue} | \\| | \\__, \\__/ | \\ |___ ${colors.green}\\`-._,-`-,${colors.reset} - ${colors.green}`._,._,\'${colors.reset} - ${colors.purple} ${workflow.manifest.name} v${workflow.manifest.version}${colors.reset} - ${dashedLine(monochrome_logs)} - """.stripIndent() - ) - } -} diff --git a/lib/Utils.groovy b/lib/Utils.groovy deleted file mode 100755 index 18173e985..000000000 --- a/lib/Utils.groovy +++ /dev/null @@ -1,47 +0,0 @@ -// -// This file holds several Groovy functions that could be useful for any Nextflow pipeline -// - -import org.yaml.snakeyaml.Yaml - -class Utils { - - // - // When running with -profile conda, warn if channels have not been set-up appropriately - // - public static void checkCondaChannels(log) { - Yaml parser = new Yaml() - def channels = [] - try { - def config = parser.load("conda config --show channels".execute().text) - channels = config.channels - } catch(NullPointerException | IOException e) { - log.warn "Could not verify conda channel configuration." 
- return - } - - // Check that all channels are present - def required_channels = ['conda-forge', 'bioconda', 'defaults'] - def conda_check_failed = !required_channels.every { ch -> ch in channels } - - // Check that they are in the right order - conda_check_failed |= !(channels.indexOf('conda-forge') < channels.indexOf('bioconda')) - conda_check_failed |= !(channels.indexOf('bioconda') < channels.indexOf('defaults')) - - if (conda_check_failed) { - log.warn "=============================================================================\n" + - " There is a problem with your Conda configuration!\n\n" + - " You will need to set-up the conda-forge and bioconda channels correctly.\n" + - " Please refer to https://bioconda.github.io/user/install.html#set-up-channels\n" + - " NB: The order of the channels matters!\n" + - "===================================================================================" - } - } - - // - // Join module args with appropriate spacing - // - public static String joinModuleArgs(args_list) { - return ' ' + args_list.join(' ') - } -} diff --git a/lib/WorkflowAutometa.groovy b/lib/WorkflowAutometa.groovy deleted file mode 100755 index e66120fa0..000000000 --- a/lib/WorkflowAutometa.groovy +++ /dev/null @@ -1,59 +0,0 @@ -// -// This file holds several functions specific to the workflow/autometa.nf in the nf-core/autometa pipeline -// - -class WorkflowAutometa { - - // - // Check and validate parameters - // - public static void initialise(params, log) { - genomeExistsError(params, log) - - if (!params.fasta) { - log.error "Genome fasta file not specified with e.g. '--fasta genome.fa' or via a detectable config file." - System.exit(1) - } - } - - // - // Get workflow summary for MultiQC - // - public static String paramsSummaryMultiqc(workflow, summary) { - String summary_section = '' - for (group in summary.keySet()) { - def group_params = summary.get(group) // This gets the parameters of that particular group - if (group_params) { - summary_section += " $group\n" - summary_section += " \n" - for (param in group_params.keySet()) { - summary_section += " $param${group_params.get(param) ?: 'N/A'}\n" - } - summary_section += " \n" - } - } - - String yaml_file_text = "id: '${workflow.manifest.name.replace('/','-')}-summary'\n" - yaml_file_text += "description: ' - this information is collected when the pipeline is started.'\n" - yaml_file_text += "section_name: '${workflow.manifest.name} Workflow Summary'\n" - yaml_file_text += "section_href: 'https://github.com/${workflow.manifest.name}'\n" - yaml_file_text += "plot_type: 'html'\n" - yaml_file_text += "data: |\n" - yaml_file_text += "${summary_section}" - return yaml_file_text - } - - // - // Exit pipeline if incorrect --genome key provided - // - private static void genomeExistsError(params, log) { - if (params.genomes && params.genome && !params.genomes.containsKey(params.genome)) { - log.error "=============================================================================\n" + - " Genome '${params.genome}' not found in any config files provided to the pipeline.\n" + - " Currently, the available genome keys are:\n" + - " ${params.genomes.keySet().join(", ")}\n" + - "===================================================================================" - System.exit(1) - } - } -} diff --git a/lib/WorkflowMain.groovy b/lib/WorkflowMain.groovy deleted file mode 100755 index e34547e0a..000000000 --- a/lib/WorkflowMain.groovy +++ /dev/null @@ -1,94 +0,0 @@ -// -// This file holds several functions specific to the main.nf 
workflow in the nf-core/autometa pipeline -// - -class WorkflowMain { - - // - // Citation string for pipeline - // - public static String citation(workflow) { - return "If you use ${workflow.manifest.name} for your analysis please cite:\n\n" + - // TODO nf-core: Add Zenodo DOI for pipeline after first release - //"* The pipeline\n" + - //" https://doi.org/10.5281/zenodo.XXXXXXX\n\n" + - "* The nf-core framework\n" + - " https://doi.org/10.1038/s41587-020-0439-x\n\n" + - "* Software dependencies\n" + - " https://github.com/${workflow.manifest.name}/blob/master/CITATIONS.md" - } - - // - // Print help to screen if required - // - public static String help(workflow, params, log) { - def command = "nf-core launch KwanLab/Autometa" - def help_string = '' - help_string += NfcoreTemplate.logo(workflow, params.monochrome_logs) - help_string += NfcoreSchema.paramsHelp(workflow, params, command) - help_string += '\n' + citation(workflow) + '\n' - help_string += NfcoreTemplate.dashedLine(params.monochrome_logs) - return help_string - } - - // - // Print parameter summary log to screen - // - public static String paramsSummaryLog(workflow, params, log) { - def summary_log = '' - summary_log += NfcoreTemplate.logo(workflow, params.monochrome_logs) - summary_log += NfcoreSchema.paramsSummaryLog(workflow, params) - summary_log += '\n' + citation(workflow) + '\n' - summary_log += NfcoreTemplate.dashedLine(params.monochrome_logs) - return summary_log - } - - // - // Validate parameters and print summary to screen - // - public static void initialise(workflow, params, log) { - // Print help to screen if required - if (params.help) { - log.info help(workflow, params, log) - System.exit(0) - } - - // Validate workflow parameters via the JSON schema - if (params.validate_params) { - NfcoreSchema.validateParameters(workflow, params, log) - } - - // Print parameter summary log to screen - log.info paramsSummaryLog(workflow, params, log) - - // Check that conda channels are set-up correctly - if (params.enable_conda) { - Utils.checkCondaChannels(log) - } - - // Check AWS batch settings - NfcoreTemplate.awsBatch(workflow, params) - - // Check the hostnames against configured profiles - NfcoreTemplate.hostName(workflow, params, log) - - // Check input has been provided - if (!params.input) { - log.error "Please provide an input samplesheet to the pipeline e.g. '--input samplesheet.csv'" - System.exit(1) - } - } - - // - // Get attribute from genome config file e.g. 
fasta - // - public static String getGenomeAttribute(params, attribute) { - def val = '' - if (params.genomes && params.genome && params.genomes.containsKey(params.genome)) { - if (params.genomes[ params.genome ].containsKey(attribute)) { - val = params.genomes[ params.genome ][ attribute ] - } - } - return val - } -} diff --git a/main.nf b/main.nf index c35c35414..24977c4f0 100644 --- a/main.nf +++ b/main.nf @@ -19,7 +19,6 @@ nextflow.enable.dsl = 2 ======================================================================================== */ -WorkflowMain.initialise(workflow, params, log) //////////////////////////////////////////////////// @@ -41,7 +40,7 @@ Results directory: ${params.outdir} ======================================================================================== */ -include { AUTOMETA } from './workflows/autometa.nf' addParams(single_db_dir: params.single_db_dir) +include { AUTOMETA } from './workflows/autometa.nf' /* ======================================================================================== diff --git a/modules.json b/modules.json index 229d1b13c..711e99009 100644 --- a/modules.json +++ b/modules.json @@ -1,13 +1,26 @@ { - "name": "nf-core/autometa", - "homePage": "https://github.com/nf-core/autometa", + "name": "autometa", + "homePage": "https://github.com/KwanLab/Autometa", "repos": { - "nf-core/modules": { - "bowtie2/align": { - "git_sha": "e937c7950af70930d1f34bb961403d9d2aa81c7d" - }, - "prodigal": { - "git_sha": "e937c7950af70930d1f34bb961403d9d2aa81c7d" + "https://github.com/nf-core/modules.git": { + "modules": { + "nf-core": { + "bowtie2/align": { + "branch": "master", + "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "installed_by": ["modules"] + }, + "custom/dumpsoftwareversions": { + "branch": "master", + "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "installed_by": ["modules"] + }, + "prodigal": { + "branch": "master", + "git_sha": "c8e35eb2055c099720a75538d1b8adb3fb5a464c", + "installed_by": ["modules"] + } + } } } } diff --git a/modules/local/align_reads.nf b/modules/local/align_reads.nf index 472a8a25f..64b20bd96 100644 --- a/modules/local/align_reads.nf +++ b/modules/local/align_reads.nf @@ -1,19 +1,11 @@ #!/usr/bin/env nextflow -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName } from './functions' - -params.options = [:] -options = initOptions(params.options) - process ALIGN_READS { tag "Aligning reads to ${meta.id}" label 'process_high' - publishDir "${params.outdir}/${meta.id}", mode: params.publish_dir_mode - - conda (params.enable_conda ? 
"bioconda::autometa" : null) + conda "bioconda::autometa" if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0" } else { @@ -24,28 +16,34 @@ process ALIGN_READS { tuple val(meta), path(metagenome), path(fwd_reads), path(rev_reads) output: - tuple val(meta), path("alignments.sam"), emit: sam - path "*.db*.bt2" , emit: bt2_db - path "*.version.txt" , emit: version + tuple val(meta), path("*.alignments.sam") , emit: sam + path "*.db*.bt2" , emit: bt2_db + path "versions.yml" , emit: versions when: - meta.cov_from_assembly.equals('0') + task.ext.when == null || task.ext.when script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" """ bowtie2-build \\ - ${options.args} \\ + ${args} \\ ${metagenome} \\ - ${meta.id}.db + ${prefix}.db bowtie2 \\ -x ${meta.id}.db \\ - ${options.args2} \\ + ${args2} \\ -p ${task.cpus} \\ - -S alignments.sam \\ + -S ${prefix}.alignments.sam \\ -1 $fwd_reads \\ -2 $rev_reads - echo \$(bowtie2 --version 2>&1) | sed -n 's/^.*bowtie2-align-s version //p; s/ .*\$//' > bowtie2.version.txt + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bowtie2: \$(echo \$(bowtie2 --version 2>&1) | sed 's/^.*bowtie2-align-s version //; s/ .*\$//') + END_VERSIONS """ } diff --git a/modules/local/analyze_kmers.nf b/modules/local/analyze_kmers.nf deleted file mode 100644 index 8de91e294..000000000 --- a/modules/local/analyze_kmers.nf +++ /dev/null @@ -1,46 +0,0 @@ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName } from './functions' - -params.options = [:] -options = initOptions(params.options) - -process ANALYZE_KMERS { - tag "Counting kmers for ${meta.id}" - label 'process_medium' - publishDir "${params.outdir}/${meta.id}", mode: params.publish_dir_mode - - conda (params.enable_conda ? 
"autometa" : null) - if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/autometa" - } else { - container "jasonkwan/autometa:${params.autometa_image_tag}" - } - - input: - tuple val(meta), path(metagenome) - - output: - tuple val(meta), path("kmers.tsv") , emit: counts - tuple val(meta), path("kmers.normalized.tsv"), emit: normalized - tuple val(meta), path("kmers.embedded.tsv") , emit: embedded - path '*.version.txt' , emit: version - - script: - def software = getSoftwareName(task.process) - """ - autometa-kmers \\ - --fasta ${metagenome} \\ - --kmers "kmers.tsv" \\ - --size "${params.kmer_size}" \\ - --norm-output "kmers.normalized.tsv" \\ - --norm-method "${params.norm_method}" \\ - --pca-dimensions "${params.pca_dimensions}" \\ - --embedding-output "kmers.embedded.tsv" \\ - --embedding-method "${params.embedding_method}" \\ - --embedding-dimensions "${params.embedding_dimensions}" \\ - --cpus "${task.cpus}" \\ - --seed 42 - - autometa --version | sed -e "s/autometa: //g" > ${software}.version.txt - """ -} diff --git a/modules/local/bedtools_genomecov.nf b/modules/local/bedtools_genomecov.nf index 64e2241a0..cbfd55cf1 100644 --- a/modules/local/bedtools_genomecov.nf +++ b/modules/local/bedtools_genomecov.nf @@ -1,16 +1,8 @@ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName } from './functions' - -params.options = [:] -options = initOptions(params.options) - process BEDTOOLS_GENOMECOV { tag "${meta.id}" label 'process_medium' - publishDir "${params.outdir}/${meta.id}", mode: params.publish_dir_mode - - conda (params.enable_conda ? "bioconda::bedtools=2.30.0" : null) + conda "bioconda::bedtools=2.30.0" if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { container "https://depot.galaxyproject.org/singularity/bedtools:2.30.0--hc088bd4_0" } else { @@ -21,20 +13,25 @@ process BEDTOOLS_GENOMECOV { tuple val(meta), path(bam) output: - tuple val(meta), path("alignments.bed"), emit: bed - path "*.version.txt" , emit: version + tuple val(meta), path("*alignments.bed"), emit: bed + path "versions.yml" , emit: versions when: meta.cov_from_assembly.equals('0') + task.ext.when == null || task.ext.when script: - def software = getSoftwareName(task.process) + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" """ bedtools \\ genomecov \\ -ibam ${bam} \\ - $options.args > alignments.bed + ${args} > ${prefix}.alignments.bed - bedtools --version | sed -e "s/bedtools v//g" > ${software}.version.txt + cat <<-END_VERSIONS > versions.yml + "${task.process}": + bedtools: \$(bedtools --version | sed -e "s/bedtools v//g") + END_VERSIONS """ } diff --git a/modules/local/binning.nf b/modules/local/binning.nf index 877a65014..977d81de8 100644 --- a/modules/local/binning.nf +++ b/modules/local/binning.nf @@ -1,15 +1,8 @@ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName } from './functions' - -params.options = [:] -options = initOptions(params.options) - process BINNING { tag "sample:${meta.id}, clustering:${params.clustering_method}, completeness:${params.completeness}, purity:${params.purity}, cov.std.dev.:${params.cov_stddev_limit}, gc.std.dev.:${params.gc_stddev_limit}" label 'process_high' - publishDir "${params.outdir}/${meta.id}", mode: params.publish_dir_mode - conda (params.enable_conda ? 
"bioconda::autometa" : null) + conda "bioconda::autometa" if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0" } else { @@ -20,24 +13,28 @@ process BINNING { errorStrategy { task.exitStatus in 204 ? 'ignore' : 'terminate' } input: - tuple val(meta), path(kmers), path(coverage), path(gc_content), path(markers), path(taxonomy) + tuple val(meta), path(kmers), path(markers), path(coverage), path(gc_content), path(taxonomy) output: - tuple val(meta), path("${params.kingdom}.binning.tsv.gz") , emit: binning - tuple val(meta), path("${params.kingdom}.binning.main.tsv.gz"), emit: main - path '*.version.txt' , emit: version + tuple val(meta), path("*.binning.tsv.gz") , emit: binning + tuple val(meta), path("*.binning.main.tsv.gz") , emit: main + path 'versions.yml' , emit: versions + + when: + task.ext.when == null || task.ext.when script: - def software = getSoftwareName(task.process) taxonomy_call = params.taxonomy_aware ? "--taxonomy $taxonomy" : "" // https://github.com/nextflow-io/nextflow/issues/1694#issuecomment-683272275 + def prefix = task.ext.prefix ?: "${meta.id}" + def taxon = meta.taxon ?: "${meta.taxon}" """ autometa-binning \\ --kmers $kmers \\ --coverages $coverage \\ --gc-content $gc_content \\ --markers $markers \\ - --output-binning ${params.kingdom}.binning.tsv.gz \\ - --output-main ${params.kingdom}.binning.main.tsv.gz \\ + --output-binning ${prefix}.${taxon}.binning.tsv.gz \\ + --output-main ${prefix}.${taxon}.binning.main.tsv.gz \\ --clustering-method ${params.clustering_method} \\ --completeness ${params.completeness} \\ --purity ${params.purity} \\ @@ -47,8 +44,12 @@ process BINNING { --starting-rank ${params.binning_starting_rank} \\ --cpus ${task.cpus} \\ --rank-filter superkingdom \\ - --rank-name-filter ${params.kingdom} + --rank-name-filter ${taxon} \\ + --verbose - autometa --version | sed -e "s/autometa: //g" > ${software}.version.txt + cat <<-END_VERSIONS > versions.yml + "${task.process}": + autometa: \$(autometa --version | sed -e 's/autometa: //g') + END_VERSIONS """ } diff --git a/modules/local/binning_summary.nf b/modules/local/binning_summary.nf index f0c8010db..db5dbeeda 100644 --- a/modules/local/binning_summary.nf +++ b/modules/local/binning_summary.nf @@ -1,16 +1,8 @@ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName } from './functions' - -params.options = [:] -options = initOptions(params.options) - process BINNING_SUMMARY { tag "Gathering binning summary for ${meta.id}" label 'process_high' - publishDir "${params.outdir}/${meta.id}", mode: params.publish_dir_mode - - conda (params.enable_conda ? 
"bioconda::autometa" : null) + conda "bioconda::autometa" if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0" } else { @@ -18,30 +10,34 @@ process BINNING_SUMMARY { } input: - tuple val(meta), path(binning_main), path(markers), path(metagenome) - val(binning_column) - path(ncbi) + tuple val(meta), path(binning_main), path(markers), path(metagenome), path(taxdump_files), val(dbtype), val(binning_column) output: - tuple val(meta), path("metabin_stats.tsv") , emit: stats - tuple val(meta), path("metabins") , emit: metabins - tuple val(meta), path("metabin_taxonomy.tsv"), emit: taxonomies, optional: true - path '*.version.txt' , emit: version + tuple val(meta), path("*metabin_stats.tsv") , emit: stats + tuple val(meta), path("*metabins") , emit: metabins + tuple val(meta), path("*metabin_taxonomy.tsv") , emit: taxonomies, optional: true + path 'versions.yml' , emit: versions + + when: + task.ext.when == null || task.ext.when script: - def software = getSoftwareName(task.process) + def prefix = task.ext.prefix ?: "${meta.id}" """ autometa-binning-summary \\ - --dbdir $ncbi \\ - --dbtype ncbi \\ + --dbdir . \\ + --dbtype ${dbtype} \\ --binning-main $binning_main \\ --markers $markers \\ --metagenome $metagenome \\ --binning-column $binning_column \\ - --output-stats "metabin_stats.tsv" \\ - --output-taxonomy "metabin_taxonomy.tsv" \\ - --output-metabins "metabins" - - autometa --version | sed -e "s/autometa: //g" > ${software}.version.txt + --output-stats "${prefix}.metabin_stats.tsv" \\ + --output-taxonomy "${prefix}.metabin_taxonomy.tsv" \\ + --output-metabins "${prefix}.metabins" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + autometa: \$(autometa --version | sed -e 's/autometa: //g') + END_VERSIONS """ } diff --git a/modules/local/count_kmers.nf b/modules/local/count_kmers.nf index ff114fa0c..82ec8efcd 100644 --- a/modules/local/count_kmers.nf +++ b/modules/local/count_kmers.nf @@ -1,15 +1,9 @@ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName } from './functions' - -params.options = [:] -options = initOptions(params.options) process COUNT_KMERS { tag "Counting ${params.kmer_size}-mers for ${meta.id}" label 'process_medium' - publishDir "${params.outdir}/${meta.id}", mode: params.publish_dir_mode - conda (params.enable_conda ? 
"autometa" : null) + conda "autometa" if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { container "https://depot.galaxyproject.org/singularity/autometa" } else { @@ -20,19 +14,25 @@ process COUNT_KMERS { tuple val(meta), path(metagenome) output: - tuple val(meta), path("kmers.tsv") , emit: counts - path '*.version.txt' , emit: version + tuple val(meta), path("*kmers.tsv") , emit: counts + path 'versions.yml' , emit: versions + + when: + task.ext.when == null || task.ext.when script: - def software = getSoftwareName(task.process) + def prefix = task.ext.prefix ?: "${meta.id}" """ autometa-kmers \\ --fasta $metagenome \\ - --kmers "kmers.tsv" \\ + --kmers "${prefix}.kmers.tsv" \\ --size "${params.kmer_size}" \\ --cpus "${task.cpus}" \\ --seed 42 - autometa --version | sed -e "s/autometa: //g" > ${software}.version.txt + cat <<-END_VERSIONS > versions.yml + "${task.process}": + autometa: \$(autometa --version | sed -e 's/autometa: //g') + END_VERSIONS """ } diff --git a/modules/local/diamond_blastp.nf b/modules/local/diamond_blastp.nf index 20db06c68..cc78345ea 100644 --- a/modules/local/diamond_blastp.nf +++ b/modules/local/diamond_blastp.nf @@ -1,23 +1,14 @@ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName } from './functions' - -params.options = [:] -options = initOptions(params.options) - process DIAMOND_BLASTP { tag "Aligning ORFS in ${meta.id} against ${diamond_database}" label 'process_high' // Old diamond manual suggested *NOT* running in parallel... so we are setting maxForks to 1 here. - // TODO: There appears to be features for multiprocessing available now - // See: https://github.com/bbuchfink/diamond/wiki/6.-Distributed-computing maxForks 1 - publishDir "${params.outdir}/${meta.id}", mode: params.publish_dir_mode - conda (params.enable_conda ? 
"bioconda::diamond=2.0.14" : null) + conda "bioconda::diamond=2.1.10" if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/diamond:2.0.14--hdcc8f71_0" + container "https://depot.galaxyproject.org/singularity/diamond:2.1.10--h43eeafb_2" } else { - container "quay.io/biocontainers/diamond:2.0.14--hdcc8f71_0" + container "quay.io/biocontainers/diamond:2.1.10--h43eeafb_2" } input: @@ -25,18 +16,25 @@ process DIAMOND_BLASTP { path(diamond_database) output: - tuple val(meta), path("blastp.tsv"), emit: diamond_results - path "*.version.txt" , emit: version + tuple val(meta), path("*blastp.tsv"), emit: diamond_results + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when script: - def software = getSoftwareName(task.process) + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" """ - diamond blastp $options.args \\ + diamond blastp $args \\ --query ${protein_fasta} \\ --db ${diamond_database} \\ --threads ${task.cpus} \\ - --out blastp.tsv + --out ${prefix}.blastp.tsv - diamond version | sed 's/^.*diamond version //' > diamond.version.txt + cat <<-END_VERSIONS > versions.yml + "${task.process}": + diamond: \$(diamond --version 2>&1 | tail -n 1 | sed 's/^diamond version //') + END_VERSIONS """ } diff --git a/modules/local/diamond_makedb.nf b/modules/local/diamond_makedb.nf index 629d24e12..24c3d36f4 100644 --- a/modules/local/diamond_makedb.nf +++ b/modules/local/diamond_makedb.nf @@ -1,17 +1,9 @@ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName } from './functions' - -params.options = [:] -params.nr_dmnd_dir = null -options = initOptions(params.options) process DIAMOND_MAKEDB { tag ' Preparing Diamond database' label 'process_high' - storeDir "${params.nr_dmnd_dir}" - - conda (params.enable_conda ? 
"bioconda::diamond=2.0.9" : null) + conda "bioconda::diamond=2.0.9" if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { container "https://depot.galaxyproject.org/singularity/diamond:2.0.9--hdcc8f71_0" } else { @@ -23,17 +15,24 @@ process DIAMOND_MAKEDB { val(dbname) output: - path("*.dmnd"), emit: diamond_db - path "*.version.txt" , emit: version + path("*.dmnd") , emit: diamond_db + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when script: - def software = getSoftwareName(task.process) + def args = task.ext.args ?: '' """ diamond makedb --in ${fasta} \\ - $options.args \\ + $args \\ --threads ${task.cpus} \\ --db ${dbname} - diamond version | sed 's/^.*diamond version //' > diamond.version.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + diamond: \$(diamond --version 2>&1 | tail -n 1 | sed 's/^diamond version //') + END_VERSIONS """ } diff --git a/modules/local/embed_kmers.nf b/modules/local/embed_kmers.nf index 24b603747..145400777 100644 --- a/modules/local/embed_kmers.nf +++ b/modules/local/embed_kmers.nf @@ -1,15 +1,10 @@ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName } from './functions' - -params.options = [:] -options = initOptions(params.options) - process EMBED_KMERS { tag "PCA dims:${params.pca_dimensions}, dims:${params.embedding_dimensions}, method:${params.embedding_method}, sample:${meta.id}" label 'process_medium' - publishDir "${params.outdir}/${meta.id}", mode: params.publish_dir_mode +// errorStrategy ignore all + errorStrategy 'ignore' + conda "autometa" - conda (params.enable_conda ? "autometa" : null) if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { container "https://depot.galaxyproject.org/singularity/autometa" } else { @@ -17,27 +12,33 @@ process EMBED_KMERS { } // Not enough contigs to perform embedding with current parameter settings... - errorStrategy { task.exitStatus in 153 ? 'ignore' : 'terminate' } + // errorStrategy { task.exitStatus in 153 ? 
'ignore' : 'terminate' } input: tuple val(meta), path(normalized) output: - tuple val(meta), path("kmers.embedded.tsv") , emit: embedded - path '*.version.txt' , emit: version + tuple val(meta), path("*kmers.embedded.tsv") , emit: embedded + path 'versions.yml' , emit: versions + + when: + task.ext.when == null || task.ext.when script: - def software = getSoftwareName(task.process) + def prefix = task.ext.prefix ?: "${meta.id}" """ autometa-kmers \\ --norm-output $normalized \\ --pca-dimensions "${params.pca_dimensions}" \\ - --embedding-output "kmers.embedded.tsv" \\ + --embedding-output "${prefix}.kmers.embedded.tsv" \\ --embedding-method "${params.embedding_method}" \\ --embedding-dimensions "${params.embedding_dimensions}" \\ --cpus "${task.cpus}" \\ --seed 42 - autometa --version | sed -e "s/autometa: //g" > ${software}.version.txt + cat <<-END_VERSIONS > versions.yml + "${task.process}": + autometa: \$(autometa --version | sed -e 's/autometa: //g') + END_VERSIONS """ } diff --git a/modules/local/functions.nf b/modules/local/functions.nf deleted file mode 100644 index da9da093d..000000000 --- a/modules/local/functions.nf +++ /dev/null @@ -1,68 +0,0 @@ -// -// Utility functions used in nf-core DSL2 module files -// - -// -// Extract name of software tool from process name using $task.process -// -def getSoftwareName(task_process) { - return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase() -} - -// -// Function to initialise default values and to generate a Groovy Map of available options for nf-core modules -// -def initOptions(Map args) { - def Map options = [:] - options.args = args.args ?: '' - options.args2 = args.args2 ?: '' - options.args3 = args.args3 ?: '' - options.publish_by_meta = args.publish_by_meta ?: [] - options.publish_dir = args.publish_dir ?: '' - options.publish_files = args.publish_files - options.suffix = args.suffix ?: '' - return options -} - -// -// Tidy up and join elements of a list to return a path string -// -def getPathFromList(path_list) { - def paths = path_list.findAll { item -> !item?.trim().isEmpty() } // Remove empty entries - paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes - return paths.join('/') -} - -// -// Function to save/publish module results -// -def saveFiles(Map args) { - if (!args.filename.endsWith('.version.txt')) { - def ioptions = initOptions(args.options) - def path_list = [ ioptions.publish_dir ?: args.publish_dir ] - if (ioptions.publish_by_meta) { - def key_list = ioptions.publish_by_meta instanceof List ? ioptions.publish_by_meta : args.publish_by_meta - for (key in key_list) { - if (args.meta && key instanceof String) { - def path = key - if (args.meta.containsKey(key)) { - path = args.meta[key] instanceof Boolean ? "${key}_${args.meta[key]}".toString() : args.meta[key] - } - path = path instanceof String ? 
path : '' - path_list.add(path) - } - } - } - if (ioptions.publish_files instanceof Map) { - for (ext in ioptions.publish_files) { - if (args.filename.endsWith(ext.key)) { - def ext_list = path_list.collect() - ext_list.add(ext.value) - return "${getPathFromList(ext_list)}/$args.filename" - } - } - } else if (ioptions.publish_files == null) { - return "${getPathFromList(path_list)}/$args.filename" - } - } -} diff --git a/modules/local/get_genomes_for_mock.nf b/modules/local/get_genomes_for_mock.nf index e7b8903bb..3d6c5c3e2 100644 --- a/modules/local/get_genomes_for_mock.nf +++ b/modules/local/get_genomes_for_mock.nf @@ -1,18 +1,8 @@ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName } from './functions' - -params.options = [:] -options = initOptions(params.options) - process GET_GENOMES_FOR_MOCK { - def genome_count = options.args2.tokenize('|').size() - tag "fetching ${genome_count} genomes" - - storeDir = 'mock_data/genomes' cache 'lenient' - conda (params.enable_conda ? "bioconda::emboss=6.6.0" : null) - container "jasonkwan/autometa-nf-modules-get_genomes_for_mock:${params.autometa_image_tag}" + conda "bioconda::emboss=6.6.0" + container "jasonkwan/autometa-nf-modules-get_genomes_for_mock:main" output: path "metagenome.fna.gz", emit: metagenome @@ -21,23 +11,31 @@ process GET_GENOMES_FOR_MOCK { path "assembly_to_locus.txt", emit: assembly_to_locus path "assemblies.txt", emit: assemblies path "assembly_report.txt", emit: assembly_report + path "versions.yml" , emit: versions + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' """ - curl -s ${options.args} > assembly_report.txt + curl -s ${args} > assembly_report.txt - grep -E "${options.args2}" assembly_report.txt |\\ + grep -E "${args2}" assembly_report.txt |\\ awk -F '\\t' '{print \$20}' |\\ sed 's,https://,rsync://,' |\\ - xargs -n 1 -I {} \ - rsync -am \ - --exclude='*_rna_from_genomic.fna.gz' \ - --exclude='*_cds_from_genomic.fna.gz' \ - --include="*_genomic.fna.gz" \ - --include="*_protein.faa.gz" \ - --include='*/' \ + xargs -n 1 -I {} \\ + rsync -am \\ + --exclude='*_rna_from_genomic.fna.gz' \\ + --exclude='*_cds_from_genomic.fna.gz' \\ + --include="*_genomic.fna.gz" \\ + --include="*_protein.faa.gz" \\ + --include='*/' \\ --exclude='*' {} . # "clean_mock_data.sh" is here: ~/Autometa/bin/clean_mock_data.sh clean_mock_data.sh + cat <<-END_VERSIONS > versions.yml + "${task.process}": + rsync: \$(rsync --version | head -n1 | sed 's/^rsync version //' | sed 's/\s.*//') + END_VERSIONS """ } diff --git a/modules/local/get_software_versions.nf b/modules/local/get_software_versions.nf deleted file mode 100644 index ccbf3f873..000000000 --- a/modules/local/get_software_versions.nf +++ /dev/null @@ -1,45 +0,0 @@ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName } from './functions' - -params.options = [:] -options = initOptions(params.options) - -/* -This file is left in from the template, that's mainly used for QUAST (http://cab.spbu.ru/software/quast/). -There's a discussion that can be had later about incorporating that module fully or removing the remaining template that feeds into it -*/ - -process GET_SOFTWARE_VERSIONS { - label 'process_low' - publishDir "${params.outdir}", - mode: params.publish_dir_mode, - saveAs: { filename -> saveFiles(filename:filename, options:params.options, publish_dir:'pipeline_info', meta:[:], publish_by_meta:[]) } - - conda (params.enable_conda ? 
"conda-forge::python=3.8.3" : null) - if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/python:3.8.3" - } else { - container "quay.io/biocontainers/python:3.8.3" - } - - cache false - - input: - path versions - - output: - path "software_versions.tsv" , emit: tsv - path 'software_versions_mqc.yaml', emit: yaml - path '*.version.txt' , emit: version - - script: - // Add soft-links to original FastQs for consistent naming in pipeline - def software = getSoftwareName(task.process) - """ - echo $workflow.manifest.version > pipeline.version.txt - echo $workflow.nextflow.version > nextflow.version.txt - scrape_software_versions.py &> software_versions_mqc.yaml - - echo "make linter happy" > ${software}.version.txt - """ -} diff --git a/modules/local/hmmer_hmmsearch.nf b/modules/local/hmmer_hmmsearch.nf index 7e49c6e2f..60972da59 100644 --- a/modules/local/hmmer_hmmsearch.nf +++ b/modules/local/hmmer_hmmsearch.nf @@ -9,19 +9,11 @@ the results of this process ======================= */ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName } from './functions' - -params.options = [:] -options = initOptions(params.options) - process HMMER_HMMSEARCH { tag "Annotating ORFs in $meta.id" label 'process_medium' - // no publishdir - - conda (params.enable_conda ? "bioconda::hmmer=3.3.2" : null) + conda "bioconda::hmmer=3.3.2" if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { container "https://depot.galaxyproject.org/singularity/hmmer:3.3.2--h1b792b2_1" } else { @@ -34,20 +26,30 @@ process HMMER_HMMSEARCH { output: tuple val(meta), path("*.domtblout"), emit: domtblout - path "*.version.txt" , emit: version + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when script: - def software = getSoftwareName(task.process) - def prefix = options.suffix ? "${meta.id}${options.suffix}" : "${meta.id}" - def fastacmd = fasta.getExtension() == 'gz' ? 
"gunzip -c $fasta" : "cat $fasta" + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' """ + # hmmsearch can'ts use or pipe in gzipped fasta + + zcat "${fasta}" > temp.fa + hmmsearch \\ --domtblout "${hmm.simpleName}.domtblout" \\ - ${options.args} \\ - ${options.args2} \\ - $hmm \\ - $fasta > /dev/null 2>&1 - - echo \$(hmmalign -h | grep -o '^# HMMER [0-9.]*') | sed 's/^# HMMER *//' > HMMER.version.txt - """ + --cpu $task.cpus \\ + $args \\ + $args2 \\ + "${hmm}" \\ + temp.fa > /dev/null 2>&1 + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + hmmer: \$(hmmsearch -h | grep -o '^# HMMER [0-9.]*' | sed 's/^# HMMER *//') + END_VERSIONS + """ } diff --git a/modules/local/hmmer_hmmsearch_filter.nf b/modules/local/hmmer_hmmsearch_filter.nf index 26d1d8cee..a306d6263 100644 --- a/modules/local/hmmer_hmmsearch_filter.nf +++ b/modules/local/hmmer_hmmsearch_filter.nf @@ -7,12 +7,6 @@ TODO: Not yet implemented */ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName } from './functions' - -params.options = [:] -options = initOptions(params.options) - process HMMER_HMMSEARCH_FILTER { tag "Filtering marker hmms in ${meta.id}" label 'process_medium' @@ -20,9 +14,8 @@ process HMMER_HMMSEARCH_FILTER { // if ( params.num_splits < 2 ) { // if running in parallel, the results are published from the process // that merges the individual results from this process - publishDir "${params.outdir}/${meta.id}", mode: params.publish_dir_mode - conda (params.enable_conda ? "autometa" : null) + conda "autometa" if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0" } else { @@ -35,11 +28,12 @@ process HMMER_HMMSEARCH_FILTER { output: tuple val(meta), path("markers.tsv"), emit: markers_tsv - path "*.version.txt" , emit: version + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when script: - def software = getSoftwareName(task.process) - def prefix = options.suffix ? "${meta.id}${options.suffix}" : "${meta.id}" """ autometa-hmmsearch-filter \\ --domtblout "$domtblout" \\ @@ -47,6 +41,9 @@ process HMMER_HMMSEARCH_FILTER { --seqdb "$fasta" \\ --out "markers.tsv" - autometa --version | sed -e "s/autometa: //g" > ${software}.version.txt + cat <<-END_VERSIONS > versions.yml + "${task.process}": + autometa: \$(autometa --version | sed -e 's/autometa: //g') + END_VERSIONS """ } diff --git a/modules/local/length_table.nf b/modules/local/length_table.nf deleted file mode 100644 index a94c76833..000000000 --- a/modules/local/length_table.nf +++ /dev/null @@ -1,41 +0,0 @@ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName } from './functions' - -params.options = [:] -options = initOptions(params.options) - -process LENGTH_TABLE { - tag "${meta.id}" - label 'process_low' - - publishDir "${params.outdir}/${meta.id}", mode: params.publish_dir_mode - - conda (params.enable_conda ? 
"bioconda::autometa" : null) - if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0" - } else { - container "jasonkwan/autometa:${params.autometa_image_tag}" - } - - input: - tuple val(meta), path(metagenome) - - output: - tuple val(meta), path("lengths.tsv"), emit: lengths - path '*.version.txt' , emit: version - - script: - def software = getSoftwareName(task.process) - """ - #!/usr/bin/env python - from Bio import SeqIO - import pandas as pd - - seqs = {record.id: len(record.seq) for record in SeqIO.parse(${metagenome}, "fasta")} - lengths = pd.Series(seqs, name="length") - lengths.index.name = "contig" - lengths.to_csv(lengths.tsv, sep="\t", index=True, header=True) - - autometa --version | sed -e "s/autometa: //g" > ${software}.version.txt - """ -} diff --git a/modules/local/majority_vote.nf b/modules/local/majority_vote.nf index 6271b7bd2..83a34a5d0 100644 --- a/modules/local/majority_vote.nf +++ b/modules/local/majority_vote.nf @@ -1,16 +1,9 @@ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName } from './functions' - -params.options = [:] -options = initOptions(params.options) - process MAJORITY_VOTE { tag "Performing taxon majority vote on ${meta.id}" label 'process_medium' - publishDir "${params.outdir}/${meta.id}", mode: params.publish_dir_mode - conda (params.enable_conda ? "bioconda::autometa" : null) + conda "bioconda::autometa" if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0" } else { @@ -19,21 +12,28 @@ process MAJORITY_VOTE { input: tuple val(meta), path(lca) - path(ncbi_tax_dir) + path taxdump_files // instead of passing to --dbdir, stage and pass '.' + val dbtype output: - tuple val(meta), path("votes.tsv"), emit: votes - path '*.version.txt' , emit: version + tuple val(meta), path("*votes.tsv") , emit: votes + path 'versions.yml' , emit: versions + + when: + task.ext.when == null || task.ext.when script: - def software = getSoftwareName(task.process) + def prefix = task.ext.prefix ?: "${meta.id}" """ autometa-taxonomy-majority-vote \\ --lca ${lca} \\ - --output votes.tsv \\ - --dbdir "${ncbi_tax_dir}" \\ - --dbtype ncbi - - autometa --version | sed -e "s/autometa: //g" > ${software}.version.txt + --output ${prefix}.votes.tsv \\ + --dbdir . \\ + --dbtype ${dbtype} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + autometa: \$(autometa --version | sed -e 's/autometa: //g') + END_VERSIONS """ } diff --git a/modules/local/markers.nf b/modules/local/markers.nf index 5835735b7..25175761c 100644 --- a/modules/local/markers.nf +++ b/modules/local/markers.nf @@ -1,17 +1,11 @@ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName } from './functions' -params.options = [:] -options = initOptions(params.options) // TODO: For faster results/less I/O this could be replaced with hmmsearch process MARKERS { - tag "Finding markers for ${meta.id}" + tag "Finding ${meta.taxon} markers for ${meta.id}" label "process_medium" - publishDir "${params.outdir}/${meta.id}", mode: params.publish_dir_mode - - conda (params.enable_conda ? 
"autometa" : null) + conda "autometa" if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0" } else { @@ -23,33 +17,33 @@ process MARKERS { input: tuple val(meta), path(orfs) - //path(hmmdb) currently only inside docker - //path(cutoffs) currently only inside docker output: - tuple val(meta), path("${params.kingdom}.markers.tsv"), emit: markers_tsv - tuple val(meta), path("${params.kingdom}.hmmscan.tsv"), emit: hmmscan_tsv - path '*.version.txt' , emit: version + tuple val(meta), path("*.markers.tsv") , emit: markers_tsv + tuple val(meta), path("*.hmmscan.tsv") , emit: hmmscan_tsv + path 'versions.yml' , emit: versions + + when: + task.ext.when == null || task.ext.when script: - def software = getSoftwareName(task.process) - if (params.enable_conda) - """ - exit 1 - """ - else + def prefix = task.ext.prefix ?: "${meta.id}" + def kingdom = meta.taxon """ autometa-markers \\ --orfs $orfs \\ - --hmmscan ${params.kingdom}.hmmscan.tsv \\ - --out ${params.kingdom}.markers.tsv \\ - --kingdom ${params.kingdom} \\ + --hmmscan ${prefix}.${kingdom}.hmmscan.tsv \\ + --out ${prefix}.${kingdom}.markers.tsv \\ + --kingdom ${kingdom} \\ --parallel \\ --cpus ${task.cpus} \\ --seed 42 \\ - --hmmdb "/scratch/dbs/markers/${params.kingdom}.single_copy.hmm" \\ - --cutoffs "/scratch/dbs/markers/${params.kingdom}.single_copy.cutoffs" + --hmmdb "/scratch/dbs/markers/${kingdom}.single_copy.hmm" \\ + --cutoffs "/scratch/dbs/markers/${kingdom}.single_copy.cutoffs" - autometa --version | sed -e "s/autometa: //g" > ${software}.version.txt + cat <<-END_VERSIONS > versions.yml + "${task.process}": + autometa: \$(autometa --version | sed -e 's/autometa: //g') + END_VERSIONS """ } diff --git a/modules/local/merge_fasta.nf b/modules/local/merge_fasta.nf index 461ca28d9..38e429611 100644 --- a/modules/local/merge_fasta.nf +++ b/modules/local/merge_fasta.nf @@ -1,16 +1,8 @@ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName } from './functions' - -params.options = [:] -options = initOptions(params.options) - process MERGE_FASTA { tag "Merging ${meta.id} FASTA" label 'process_low' - publishDir "${params.outdir}/${meta.id}", mode: params.publish_dir_mode - - conda (params.enable_conda ? 
"bioconda::seqkit=0.16.1" : null) + conda "bioconda::seqkit=0.16.1" if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { container "https://depot.galaxyproject.org/singularity/seqkit:0.16.1--h9ee0642_0" } else { @@ -23,14 +15,20 @@ process MERGE_FASTA { output: tuple val(meta), path("${meta.id}.${extension}"), emit: merged - path '*.version.txt' , emit: version + path 'versions.yml' , emit: versions + + when: + task.ext.when == null || task.ext.when script: - def software = getSoftwareName(task.process) """ # If errors occur because of issues with symlinks, # try: cat * | seqkit sort -n > "${meta.id}.${extension}" seqkit sort -n * > "${meta.id}.${extension}" - seqkit version | sed 's/seqkit v//g' > ${software}.version.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqkit: \$( seqkit | sed '3!d; s/Version: //' ) + END_VERSIONS """ } diff --git a/modules/local/merge_tsv.nf b/modules/local/merge_tsv.nf index b7ebccedb..ed17e58e0 100644 --- a/modules/local/merge_tsv.nf +++ b/modules/local/merge_tsv.nf @@ -1,16 +1,8 @@ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName } from './functions' - -params.options = [:] -options = initOptions(params.options) - process MERGE_TSV_WITH_HEADERS { tag "Merging files from parallel split for ${meta.id}" label 'process_low' - publishDir "${params.outdir}/${meta.id}", mode: params.publish_dir_mode - - conda (params.enable_conda ? "bioconda::autometa" : null) + conda "bioconda::autometa" if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE" @@ -26,8 +18,10 @@ process MERGE_TSV_WITH_HEADERS { tuple val(meta), path("${meta.id}.${extension}"), emit: merged_tsv + when: + task.ext.when == null || task.ext.when + script: - def software = getSoftwareName(task.process) """ awk 'FNR==1 && NR!=1{next;}{print}' *.tsv > "${meta.id}.${extension}" """ diff --git a/modules/local/mock_data_reporter.nf b/modules/local/mock_data_reporter.nf index 0c91af41d..cd471f68c 100644 --- a/modules/local/mock_data_reporter.nf +++ b/modules/local/mock_data_reporter.nf @@ -1,65 +1,31 @@ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName } from './functions' - -params.options = [:] -options = initOptions(params.options) - process MOCK_DATA_REPORT { tag 'Preparing mock data report' label 'process_low' - publishDir "${options.publish_dir}", mode: params.publish_dir_mode - - container "jasonkwan/autometa-nf-modules-mock_data_reporter:${params.autometa_image_tag}" + container "jasonkwan/autometa-nf-modules-mock_data_reporter:main" input: tuple val(meta), path(bins_path), path(assembly_to_locus_path), path(assembly_report_path) path(rmarkdown_file) output: - tuple val(meta), path("*.html"), emit: results + tuple val(meta), path("*.html") , emit: results + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when script: """ - #!/usr/bin/env Rscript - - packages <- c("markdown","data.table", "ggplot2", "plotly", "crosstalk", "magrittr", "DT", "stringi") - - for (i in packages) { - if (!requireNamespace(i)) { - install.packages(i) - } - library(i, character.only = T) - } - - rmarkdown::render( - input="${rmarkdown_file}", - params=list( - bins_path="${bins_path}", - assembly_to_locus_path="${assembly_to_locus_path}", - assembly_report_path="${assembly_report_path}", - genus=FALSE - ), - knit_root_dir=getwd(), - output_dir=getwd(), 
- output_file="mock_data_report_by_assembly.html" - ) + mock_data_report.R ${rmarkdown_file} ${bins_path} ${assembly_to_locus_path} ${assembly_report_path} - rmarkdown::render( - input="${rmarkdown_file}", - params=list( - bins_path= "${bins_path}", - assembly_to_locus_path = "${assembly_to_locus_path}", - assembly_report_path = "${assembly_report_path}", - genus=TRUE - ), - knit_root_dir=getwd(), - output_dir=getwd(), - output_file="mock_data_report_by_genus.html" - ) + cat <<-END_VERSIONS > versions.yml + "${task.process}": + R: 'For R and packages, see docker: jasonkwan/autometa-nf-modules-mock_data_reporter:main' + END_VERSIONS """ } diff --git a/modules/local/normalize_kmers.nf b/modules/local/normalize_kmers.nf index b559e52df..fcd1c0eec 100644 --- a/modules/local/normalize_kmers.nf +++ b/modules/local/normalize_kmers.nf @@ -1,15 +1,8 @@ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName } from './functions' - -params.options = [:] -options = initOptions(params.options) - process NORMALIZE_KMERS { tag "method:${params.norm_method}, sample:${meta.id}" label 'process_medium' - publishDir "${params.outdir}/${meta.id}", mode: params.publish_dir_mode - conda (params.enable_conda ? "autometa" : null) + conda "autometa" if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { container "https://depot.galaxyproject.org/singularity/autometa" } else { @@ -20,18 +13,24 @@ process NORMALIZE_KMERS { tuple val(meta), path(counts) output: - tuple val(meta), path("kmers.normalized.tsv"), emit: normalized - path '*.version.txt' , emit: version + tuple val(meta), path("*kmers.normalized.tsv"), emit: normalized + path 'versions.yml' , emit: versions + + when: + task.ext.when == null || task.ext.when script: - def software = getSoftwareName(task.process) + def prefix = task.ext.prefix ?: "${meta.id}" """ autometa-kmers \\ --kmers $counts \\ - --norm-output "kmers.normalized.tsv" \\ - --norm-method "${params.norm_method}" \\ + --norm-output ${prefix}.kmers.normalized.tsv \\ + --norm-method ${params.norm_method} \\ --seed 42 - autometa --version | sed -e "s/autometa: //g" > ${software}.version.txt + cat <<-END_VERSIONS > versions.yml + "${task.process}": + autometa: \$(autometa --version | sed -e 's/autometa: //g') + END_VERSIONS """ } diff --git a/modules/local/parse_bed.nf b/modules/local/parse_bed.nf index c92a75560..49d1f03df 100644 --- a/modules/local/parse_bed.nf +++ b/modules/local/parse_bed.nf @@ -1,16 +1,8 @@ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName } from './functions' - -params.options = [:] -options = initOptions(params.options) - process PARSE_BED { tag "$meta.id" label 'process_low' - publishDir "${params.outdir}/${meta.id}", mode: params.publish_dir_mode - - conda (params.enable_conda ? 
"bioconda::autometa" : null) + conda "bioconda::autometa" if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { container "https://depot.galaxyproject.org/singularity/bedtools:2.30.0--hc088bd4_0" } else { @@ -21,22 +13,26 @@ process PARSE_BED { tuple val(meta), path(bed) output: - tuple val(meta), path("coverage.tsv"), emit: coverage - path "*.version.txt" , emit: version + tuple val(meta), path("*coverage.tsv"), emit: coverage + path "versions.yml" , emit: versions when: meta.cov_from_assembly.equals('0') + task.ext.when == null || task.ext.when script: - def software = getSoftwareName(task.process) + def prefix = task.ext.prefix ?: "${meta.id}" """ # NOTE: Here we supply an argument to ibam to prevent raising an error # However, bed is the only arg required for nextflow since bed is generated from BEDTOOLS_GENOMECOV... autometa-bedtools-genomecov \\ --ibam . \\ --bed $bed \\ - --output coverage.tsv + --output ${prefix}.coverage.tsv - autometa --version | sed -e "s/autometa: //g" > ${software}.version.txt + cat <<-END_VERSIONS > versions.yml + "${task.process}": + autometa: \$(autometa --version | sed -e 's/autometa: //g') + END_VERSIONS """ } diff --git a/modules/local/prepare_lca.nf b/modules/local/prepare_lca.nf index ce712cd3f..2d898f3a1 100644 --- a/modules/local/prepare_lca.nf +++ b/modules/local/prepare_lca.nf @@ -1,40 +1,42 @@ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName } from './functions' - -params.options = [:] -options = initOptions(params.options) - process PREPARE_LCA { - tag "Preparing db cache from ${blastdb_dir}" + tag "Preparing db cache for ${dbtype}" label 'process_medium' - conda (params.enable_conda ? "bioconda::autometa" : null) + conda "bioconda::autometa" + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0" } else { container "jasonkwan/autometa:${params.autometa_image_tag}" } - storeDir 'db/lca' - cache 'lenient' - input: - path(blastdb_dir) + path taxdump_files // instead of passing to --dbdir, stage and pass '.' + val dbtype output: path "cache" , emit: cache - path '*.version.txt' , emit: version + path 'versions.yml' , emit: versions + + when: + task.ext.when == null || task.ext.when + + // storeDir = (dbtype == 'gtdb') ? params.gtdb_dir : (dbtype == 'ncbi' ? params.lca_dir : null) script: - def software = getSoftwareName(task.process) """ + # https://autometa.readthedocs.io/en/latest/scripts/taxonomy/lca.html autometa-taxonomy-lca \\ --blast . \\ --lca-output . \\ - --dbdir ${blastdb_dir} \\ - --dbtype ncbi \\ + --dbdir . 
\\ + --dbtype ${dbtype} \\ --cache cache \\ --only-prepare-cache - autometa --version | sed -e "s/autometa: //g" > ${software}.version.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + autometa: \$(autometa --version | sed -e 's/autometa: //g') + END_VERSIONS """ } diff --git a/modules/local/reduce_lca.nf b/modules/local/reduce_lca.nf index 031564557..2d47033bf 100644 --- a/modules/local/reduce_lca.nf +++ b/modules/local/reduce_lca.nf @@ -1,16 +1,8 @@ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName } from './functions' - -params.options = [:] -options = initOptions(params.options) - process REDUCE_LCA { - tag "Finding LCA for ${meta.id}" + tag "Finding ${dbtype} LCA for ${meta.id}" label 'process_medium' - publishDir "${params.outdir}/${meta.id}", mode: params.publish_dir_mode - - conda (params.enable_conda ? "bioconda::autometa" : null) + conda "bioconda::autometa" if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0" } else { @@ -19,27 +11,36 @@ process REDUCE_LCA { input: tuple val(meta), path(blast) - path(blastdb_dir) - path(lca_cache) + path taxdump_files // instead of passing to --dbdir, stage and pass '.' + path lca_cache + path prot_accession2taxid + val dbtype output: - tuple val(meta), path("lca.tsv"), emit: lca - path "lca_error_taxids.tsv" , emit: error_taxids - path "sseqid2taxid.tsv" , emit: sseqid_to_taxids - path '*.version.txt' , emit: version + tuple val(meta), path("*lca.tsv") , emit: lca + tuple val(meta), path("*lca_error_taxids.tsv") , emit: error_taxids + tuple val(meta), path("*sseqid2taxid.tsv") , emit: sseqid_to_taxids + path 'versions.yml' , emit: versions + + when: + task.ext.when == null || task.ext.when script: - def software = getSoftwareName(task.process) + def prefix = task.ext.prefix ?: "${meta.id}" """ autometa-taxonomy-lca \\ --blast ${blast} \\ - --dbdir ${blastdb_dir} \\ - --dbtype ncbi \\ + --dbdir . \\ + --dbtype ${dbtype} \\ --cache ${lca_cache} \\ - --lca-error-taxids lca_error_taxids.tsv \\ - --sseqid2taxid-output sseqid2taxid.tsv \\ - --lca-output lca.tsv - autometa --version | sed -e "s/autometa: //g" > ${software}.version.txt + --lca-error-taxids ${prefix}.lca_error_taxids.tsv \\ + --sseqid2taxid-output ${prefix}.sseqid2taxid.tsv \\ + --lca-output ${prefix}.lca.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + autometa: \$(autometa --version | sed -e 's/autometa: //g') + END_VERSIONS """ } diff --git a/modules/local/samplesheet_check.nf b/modules/local/samplesheet_check.nf index fdd2ce933..2458abe2d 100644 --- a/modules/local/samplesheet_check.nf +++ b/modules/local/samplesheet_check.nf @@ -2,9 +2,8 @@ process SAMPLESHEET_CHECK { tag "$samplesheet" label 'process_low' - publishDir "${params.outdir}", mode: params.publish_dir_mode - conda (params.enable_conda ? 
"conda-forge::python=3.8.3" : null) + conda "conda-forge::python=3.8.3" if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { container "https://depot.galaxyproject.org/singularity/python:3.8.3" } else { @@ -15,10 +14,19 @@ process SAMPLESHEET_CHECK { path samplesheet output: - path '*.csv' + path '*.csv' , emit: csv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when script: """ check_samplesheet.py $samplesheet samplesheet.valid.csv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version 2>&1 | tail -n 1 | sed 's/^Python //') + END_VERSIONS """ } diff --git a/modules/local/samtools_view_sort.nf b/modules/local/samtools_view_sort.nf index af83e27d9..3c4d97728 100644 --- a/modules/local/samtools_view_sort.nf +++ b/modules/local/samtools_view_sort.nf @@ -1,16 +1,8 @@ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName } from './functions' - -params.options = [:] -options = initOptions(params.options) - process SAMTOOLS_VIEW_AND_SORT { tag "$meta.id" label 'process_medium' - publishDir "${params.outdir}/${meta.id}", mode: params.publish_dir_mode - - conda (params.enable_conda ? "bioconda::samtools=1.13" : null) + conda "bioconda::samtools=1.13" if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { container "https://depot.galaxyproject.org/singularity/samtools:1.12--hd5e65b6_0" } else { @@ -21,18 +13,25 @@ process SAMTOOLS_VIEW_AND_SORT { tuple val(meta), path(sam) output: - tuple val(meta), path("alignments.bam"), emit: bam - path "*.version.txt" , emit: version + tuple val(meta), path("*.alignments.bam") , emit: bam + path "versions.yml" , emit: versions when: meta.cov_from_assembly.equals('0') + task.ext.when == null || task.ext.when script: - def software = getSoftwareName(task.process) + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + """ - samtools view ${options.args} -@ ${task.cpus} -bS ${sam} \\ - | samtools sort ${options.args2} -@ ${task.cpus} -o alignments.bam + samtools view ${args} -@ ${task.cpus} -bS ${sam} \\ + | samtools sort ${args2} -@ ${task.cpus} -o ${prefix}.alignments.bam - echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' > ${software}.version.txt + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS """ } diff --git a/modules/local/seqkit_filter.nf b/modules/local/seqkit_filter.nf index 45d4bb9fe..e32bf7908 100644 --- a/modules/local/seqkit_filter.nf +++ b/modules/local/seqkit_filter.nf @@ -1,16 +1,8 @@ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName } from './functions' - -params.options = [:] -options = initOptions(params.options) - process SEQKIT_FILTER { tag "Removing contigs < ${params.length_cutoff} bp, from ${meta.id}" label 'process_high' - publishDir "${params.outdir}/${meta.id}", mode: params.publish_dir_mode - - conda (params.enable_conda ? 
"bioconda::seqkit=0.16.1" : null) + conda "bioconda::seqkit=0.16.1" if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { container "https://depot.galaxyproject.org/singularity/seqkit:0.16.1--h9ee0642_0" } else { @@ -21,30 +13,38 @@ process SEQKIT_FILTER { tuple val(meta), path(metagenome) output: - tuple val(meta), path("filtered.fna") , emit: fasta - tuple val(meta), path("gc_content.tsv"), emit: gc_content - path '*.version.txt' , emit: version + tuple val(meta), path("*filtered.fna") , emit: fasta + tuple val(meta), path("*gc_content.tsv") , emit: gc_content + path 'versions.yml' , emit: versions + + when: + task.ext.when == null || task.ext.when script: - def software = getSoftwareName(task.process) def metagenomecmd = metagenome.getExtension() == 'gz' ? "gunzip -c $metagenome" : "cat $metagenome" + def prefix = task.ext.prefix ?: "${meta.id}" """ # filter contigs by specified length + # `seqkit seq -i` "print ID instead of full head" ${metagenomecmd} | \\ - seqkit seq -j ${task.cpus} -m ${params.length_cutoff} | \\ - seqkit sort -n > "filtered.fna" + seqkit seq -i -j ${task.cpus} -m ${params.length_cutoff} | \\ + seqkit sort -n > "${prefix}.filtered.fna" # calculate gc content - seqkit fx2tab -j ${task.cpus} -n -lg "filtered.fna" > temp + seqkit fx2tab -j ${task.cpus} -n -lg "${prefix}.filtered.fna" > temp # Extract columns, create tsv awk '{FS="\\t"; OFS="\\t"; print \$1,\$3,\$2}' temp > temp2 - echo -e "contig\\tgc_content\\tlength" | cat - temp2 > "gc_content.tsv" + echo -e "contig\\tgc_content\\tlength" | cat - temp2 > "${prefix}.gc_content.tsv" # Remove temporary files rm temp rm temp2 - seqkit version | sed 's/seqkit v//g' > ${software}.version.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqkit: \$( seqkit | sed '3!d; s/Version: //' ) + END_VERSIONS """ } diff --git a/modules/local/seqkit_split.nf b/modules/local/seqkit_split.nf index 649e42178..1adbe3028 100644 --- a/modules/local/seqkit_split.nf +++ b/modules/local/seqkit_split.nf @@ -1,14 +1,9 @@ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName } from './functions' - -params.options = [:] -options = initOptions(params.options) - +// Not implemented yet but thought was to split to parallelize prodigal process SEQKIT_SPLIT { tag "Splitting $meta.id for parallel processing" label 'process_medium' - conda (params.enable_conda ? "bioconda::seqkit=0.16.1" : null) + conda "bioconda::seqkit=0.16.1" if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { container "https://depot.galaxyproject.org/singularity/seqkit:0.16.1--h9ee0642_0" } else { @@ -20,19 +15,25 @@ process SEQKIT_SPLIT { output: tuple val(meta), path("outfolder/*") , emit: fasta - path "*.version.txt" , emit: version + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when script: - def software = getSoftwareName(task.process) - def prefix = options.suffix ? 
"${meta.id}${options.suffix}" : "${meta.id}" + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' """ seqkit \\ split \\ ${fasta} \\ - ${options.args} \\ - ${options.args2} \\ + ${args} \\ + ${args2} \\ -O outfolder - seqkit version | sed 's/seqkit v//g' > ${software}.version.txt + cat <<-END_VERSIONS > versions.yml + "${task.process}": + seqkit: \$( seqkit | sed '3!d; s/Version: //' ) + END_VERSIONS """ } diff --git a/modules/local/spades_kmer_coverage.nf b/modules/local/spades_kmer_coverage.nf index 6e803c6ef..d82cecfa3 100644 --- a/modules/local/spades_kmer_coverage.nf +++ b/modules/local/spades_kmer_coverage.nf @@ -1,16 +1,8 @@ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName } from './functions' - -params.options = [:] -options = initOptions(params.options) - process SPADES_KMER_COVERAGE { tag "${meta.id}" label 'process_low' - publishDir "${params.outdir}/${meta.id}", mode: params.publish_dir_mode - - conda (params.enable_conda ? "autometa" : null) + conda "autometa" if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { container "https://depot.galaxyproject.org/singularity/YOUR-TOOL-HERE" } else { @@ -20,22 +12,24 @@ process SPADES_KMER_COVERAGE { input: tuple val(meta), path(metagenome) - output: - tuple val(meta), path("coverage.tsv") , emit: coverage - path '*.version.txt' , emit: version + tuple val(meta), path("*coverage.tsv") , emit: coverage + path 'versions.yml' , emit: versions when: meta.cov_from_assembly.equals('spades') script: - def software = getSoftwareName(task.process) + def prefix = task.ext.prefix ?: "${meta.id}" """ autometa-coverage \\ --assembly ${metagenome} \\ --from-spades \\ - --out "coverage.tsv" + --out "${prefix}.coverage.tsv" - autometa --version | sed -e "s/autometa: //g" > ${software}.version.txt + cat <<-END_VERSIONS > versions.yml + "${task.process}": + autometa: \$(autometa --version | sed -e 's/autometa: //g') + END_VERSIONS """ } diff --git a/modules/local/split_kingdoms.nf b/modules/local/split_kingdoms.nf index 396cd828e..235feba6e 100644 --- a/modules/local/split_kingdoms.nf +++ b/modules/local/split_kingdoms.nf @@ -1,16 +1,8 @@ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName } from './functions' - -params.options = [:] -options = initOptions(params.options) - process SPLIT_KINGDOMS { tag "Splitting votes into kingdoms for ${meta.id}" label 'process_medium' - publishDir "${params.outdir}/${meta.id}", mode: params.publish_dir_mode - - conda (params.enable_conda ? "bioconda::autometa" : null) + conda "bioconda::autometa" if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0" } else { @@ -19,26 +11,47 @@ process SPLIT_KINGDOMS { input: tuple val(meta), path(assembly), path(votes) - path(ncbi_tax_dir) + path taxdump_files // instead of passing to --dbdir, stage and pass '.' 
+ val dbtype output: - tuple val(meta), path("taxonomy.tsv"), emit: taxonomy - tuple val(meta), path("bacteria.fna"), emit: bacteria, optional: true - tuple val(meta), path("archaea.fna") , emit: archaea, optional: true - tuple val(meta), path("*.fna") , emit: kingdoms, optional: true - path '*.version.txt' , emit: version + tuple val(meta), path("${dbtype}/*.taxonomy.tsv") , emit: taxonomy + tuple val(meta), path("${dbtype}/*.fna") , emit: fna + tuple val(meta), path("${dbtype}/*.unclassified.fna") , emit: unclassified_fna, optional: true + path 'versions.yml' , emit: versions + + when: + task.ext.when == null || task.ext.when script: - def software = getSoftwareName(task.process) + def prefix = task.ext.prefix ?: "${meta.id}" """ + mkdir ${dbtype} autometa-taxonomy \\ --votes "${votes}" \\ - --output . \\ + --output "./${dbtype}" \\ --split-rank-and-write superkingdom \\ --assembly "${assembly}" \\ - --dbdir "${ncbi_tax_dir}" \\ - --dbtype ncbi - - autometa --version | sed -e "s/autometa: //g" > ${software}.version.txt + --dbdir . \\ + --dbtype ${dbtype} + + # prefix all files in temp with the prefix + for file in ${dbtype}/*; do + mv "\$file" "${dbtype}/${prefix}.\$(basename \$file)" + done + + # Move .unclassified.fna files to a separate location for separate emitting + mkdir -p ${dbtype}_unclassified_fna + + for file in ${dbtype}/${prefix}.unclassified.*; do + if [ -e "\$file" ]; then + mv "\$file" ${dbtype}_unclassified_fna/ + fi + done + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + autometa: \$(autometa --version | sed -e 's/autometa: //g') + END_VERSIONS """ } diff --git a/modules/local/unclustered_recruitment.nf b/modules/local/unclustered_recruitment.nf index 460bf1b9a..bb0fa793b 100644 --- a/modules/local/unclustered_recruitment.nf +++ b/modules/local/unclustered_recruitment.nf @@ -1,16 +1,8 @@ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName } from './functions' - -params.options = [:] -options = initOptions(params.options) - process RECRUIT { tag "sample:${meta.id}, classifier:${params.classification_method}, kmer dims:${params.classification_kmer_pca_dimensions}" label 'process_high' - publishDir "${params.outdir}/${meta.id}", mode: params.publish_dir_mode - - conda (params.enable_conda ? "autometa" : null) + conda "autometa" if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0" } else { @@ -21,16 +13,19 @@ process RECRUIT { errorStrategy { task.exitStatus in 204 ? 
'ignore' : 'terminate' } input: - tuple val(meta), path(kmers), path(coverage), path(binning), path(markers), path(taxonomy) + tuple val(meta), path(kmers), path(coverage), path(markers), path(taxonomy), path(binning) output: - tuple val(meta), path("${params.kingdom}.recruitment.tsv.gz") , emit: binning, optional: true - tuple val(meta), path("${params.kingdom}.recruitment.main.tsv.gz") , emit: main, optional: true - tuple val(meta), path("${params.kingdom}.recruitment.features.tsv.gz"), emit: features, optional: true - path '*.version.txt' , emit: version + tuple val(meta), path("${params.kingdom}.recruitment.tsv.gz") , emit: binning, optional: true + tuple val(meta), path("${params.kingdom}.recruitment.main.tsv.gz") , emit: main, optional: true + tuple val(meta), path("${params.kingdom}.recruitment.features.tsv.gz") , emit: features, optional: true + path 'versions.yml' , emit: versions + + when: + task.ext.when == null || task.ext.when script: - def software = getSoftwareName(task.process) + def prefix = task.ext.prefix ?: "${meta.id}" if (!params.taxonomy_aware) """ autometa-unclustered-recruitment \\ @@ -41,27 +36,33 @@ process RECRUIT { --coverage $coverage \\ --binning $binning \\ --markers $markers \\ - --output-binning ${params.kingdom}.recruitment.tsv.gz \\ - --output-main ${params.kingdom}.recruitment.main.tsv.gz \\ - --output-features ${params.kingdom}.recruitment.features.tsv.gz + --output-binning ${prefix}.${params.kingdom}.recruitment.tsv.gz \\ + --output-main ${prefix}.${params.kingdom}.recruitment.main.tsv.gz \\ + --output-features ${prefix}.${params.kingdom}.recruitment.features.tsv.gz - autometa --version | sed -e "s/autometa: //g" > ${software}.version.txt + cat <<-END_VERSIONS > versions.yml + "${task.process}": + autometa: \$(autometa --version | sed -e 's/autometa: //g') + END_VERSIONS """ else """ autometa-unclustered-recruitment \\ - --classifier ${params.classification_method} \\ - --kmer-dimensions ${params.classification_kmer_pca_dimensions} \\ + --classifier ${prefix}.${params.classification_method} \\ + --kmer-dimensions ${prefix}.${params.classification_kmer_pca_dimensions} \\ --seed 42 \\ --taxonomy $taxonomy \\ --kmers $kmers \\ --coverage $coverage \\ --binning $binning \\ --markers $markers \\ - --output-binning ${params.kingdom}.recruitment.tsv.gz \\ - --output-main ${params.kingdom}.recruitment.main.tsv.gz \\ - --output-features ${params.kingdom}.recruitment.features.tsv.gz + --output-binning ${prefix}.${params.kingdom}.recruitment.tsv.gz \\ + --output-main ${prefix}.${params.kingdom}.recruitment.main.tsv.gz \\ + --output-features ${prefix}.${params.kingdom}.recruitment.features.tsv.gz - autometa --version | sed -e "s/autometa: //g" > ${software}.version.txt + cat <<-END_VERSIONS > versions.yml + "${task.process}": + autometa: \$(autometa --version | sed -e 's/autometa: //g') + END_VERSIONS """ } diff --git a/modules/nf-core/bowtie2/align/main.nf b/modules/nf-core/bowtie2/align/main.nf new file mode 100644 index 000000000..276f511e5 --- /dev/null +++ b/modules/nf-core/bowtie2/align/main.nf @@ -0,0 +1,71 @@ +process BOWTIE2_ALIGN { + tag "$meta.id" + label "process_high" + + conda "bioconda::bowtie2=2.4.4 bioconda::samtools=1.16.1 conda-forge::pigz=2.6" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+        'https://depot.galaxyproject.org/singularity/mulled-v2-ac74a7f02cebcfcc07d8e8d1d750af9c83b4d45a:a0ffedb52808e102887f6ce600d092675bf3528a-0' :
+        'quay.io/biocontainers/mulled-v2-ac74a7f02cebcfcc07d8e8d1d750af9c83b4d45a:a0ffedb52808e102887f6ce600d092675bf3528a-0' }"
+
+    input:
+    tuple val(meta) , path(reads)
+    tuple val(meta2), path(index)
+    val   save_unaligned
+    val   sort_bam
+
+    output:
+    tuple val(meta), path("*.bam")    , emit: bam
+    tuple val(meta), path("*.log")    , emit: log
+    tuple val(meta), path("*fastq.gz"), emit: fastq, optional:true
+    path  "versions.yml"              , emit: versions
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ""
+    def args2 = task.ext.args2 ?: ""
+    def prefix = task.ext.prefix ?: "${meta.id}"
+
+    def unaligned = ""
+    def reads_args = ""
+    if (meta.single_end) {
+        unaligned = save_unaligned ? "--un-gz ${prefix}.unmapped.fastq.gz" : ""
+        reads_args = "-U ${reads}"
+    } else {
+        unaligned = save_unaligned ? "--un-conc-gz ${prefix}.unmapped.fastq.gz" : ""
+        reads_args = "-1 ${reads[0]} -2 ${reads[1]}"
+    }
+
+    def samtools_command = sort_bam ? 'sort' : 'view'
+
+    """
+    INDEX=`find -L ./ -name "*.rev.1.bt2" | sed "s/\\.rev.1.bt2\$//"`
+    [ -z "\$INDEX" ] && INDEX=`find -L ./ -name "*.rev.1.bt2l" | sed "s/\\.rev.1.bt2l\$//"`
+    [ -z "\$INDEX" ] && echo "Bowtie2 index files not found" 1>&2 && exit 1
+
+    bowtie2 \\
+        -x \$INDEX \\
+        $reads_args \\
+        --threads $task.cpus \\
+        $unaligned \\
+        $args \\
+        2> ${prefix}.bowtie2.log \\
+        | samtools $samtools_command $args2 --threads $task.cpus -o ${prefix}.bam -
+
+    if [ -f ${prefix}.unmapped.fastq.1.gz ]; then
+        mv ${prefix}.unmapped.fastq.1.gz ${prefix}.unmapped_1.fastq.gz
+    fi
+
+    if [ -f ${prefix}.unmapped.fastq.2.gz ]; then
+        mv ${prefix}.unmapped.fastq.2.gz ${prefix}.unmapped_2.fastq.gz
+    fi
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        bowtie2: \$(echo \$(bowtie2 --version 2>&1) | sed 's/^.*bowtie2-align-s version //; s/ .*\$//')
+        samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//')
+        pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' )
+    END_VERSIONS
+    """
+}
diff --git a/modules/nf-core/bowtie2/align/meta.yml b/modules/nf-core/bowtie2/align/meta.yml
new file mode 100644
index 000000000..c8e9a0012
--- /dev/null
+++ b/modules/nf-core/bowtie2/align/meta.yml
@@ -0,0 +1,67 @@
+name: bowtie2_align
+description: Align reads to a reference genome using bowtie2
+keywords:
+  - align
+  - map
+  - fasta
+  - fastq
+  - genome
+  - reference
+tools:
+  - bowtie2:
+      description: |
+        Bowtie 2 is an ultrafast and memory-efficient tool for aligning
+        sequencing reads to long reference sequences.
+      homepage: http://bowtie-bio.sourceforge.net/bowtie2/index.shtml
+      documentation: http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml
+      doi: 10.1038/nmeth.1923
+      licence: ["GPL-3.0-or-later"]
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g. [ id:'test', single_end:false ]
+  - reads:
+      type: file
+      description: |
+        List of input FastQ files of size 1 and 2 for single-end and paired-end data,
+        respectively.
+  - meta2:
+      type: map
+      description: |
+        Groovy Map containing reference information
+        e.g. [ id:'test', single_end:false ]
[ id:'test', single_end:false ] + - index: + type: file + description: Bowtie2 genome index files + pattern: "*.ebwt" + - save_unaligned: + type: boolean + description: | + Save reads that do not map to the reference (true) or discard them (false) + (default: false) + - sort_bam: + type: boolean + description: use samtools sort (true) or samtools view (false) + pattern: "true or false" +output: + - bam: + type: file + description: Output BAM file containing read alignments + pattern: "*.{bam}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - fastq: + type: file + description: Unaligned FastQ files + pattern: "*.fastq.gz" + - log: + type: file + description: Aligment log + pattern: "*.log" +authors: + - "@joseespinosa" + - "@drpatelh" diff --git a/modules/nf-core/custom/dumpsoftwareversions/main.nf b/modules/nf-core/custom/dumpsoftwareversions/main.nf new file mode 100644 index 000000000..a941c4ee1 --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/main.nf @@ -0,0 +1,24 @@ +process CUSTOM_DUMPSOFTWAREVERSIONS { + label 'process_single' + + // Requires `pyyaml` which does not have a dedicated container but is in the MultiQC container + conda "bioconda::multiqc=1.13" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/multiqc:1.13--pyhdfd78af_0' : + 'quay.io/biocontainers/multiqc:1.13--pyhdfd78af_0' }" + + input: + path versions + + output: + path "software_versions.yml" , emit: yml + path "software_versions_mqc.yml", emit: mqc_yml + path "versions.yml" , emit: versionss + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + template 'dumpsoftwareversions.py' +} diff --git a/modules/nf-core/custom/dumpsoftwareversions/meta.yml b/modules/nf-core/custom/dumpsoftwareversions/meta.yml new file mode 100644 index 000000000..60b546a01 --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/meta.yml @@ -0,0 +1,34 @@ +name: custom_dumpsoftwareversions +description: Custom module used to dump software versions within the nf-core pipeline template +keywords: + - custom + - version +tools: + - custom: + description: Custom module used to dump software versions within the nf-core pipeline template + homepage: https://github.com/nf-core/tools + documentation: https://github.com/nf-core/tools + licence: ["MIT"] +input: + - versions: + type: file + description: YML file containing software versions + pattern: "*.yml" + +output: + - yml: + type: file + description: Standard YML file containing software versions + pattern: "software_versions.yml" + - mqc_yml: + type: file + description: MultiQC custom content YML file containing software versions + pattern: "software_versions_mqc.yml" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + +authors: + - "@drpatelh" + - "@grst" diff --git a/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py new file mode 100755 index 000000000..da0334085 --- /dev/null +++ b/modules/nf-core/custom/dumpsoftwareversions/templates/dumpsoftwareversions.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python + + +"""Provide functions to merge multiple versions.yml files.""" + + +import yaml +import platform +from textwrap import dedent + + +def _make_versions_html(versions): + """Generate a tabular HTML output of all 
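CUSTOM_DUMPSOFTWAREVERSIONS expects a single collated file holding every per-module versions.yml. The conventional nf-core call site looks roughly like the sketch below; the include path and the ch_versions channel name are illustrative, not taken from this diff.

include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main'

// ch_versions accumulates the versions.yml paths emitted by each module and subworkflow
CUSTOM_DUMPSOFTWAREVERSIONS(
    ch_versions.unique().collectFile( name: 'collated_versions.yml' )
)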
versions for MultiQC.""" + html = [ + dedent( + """\\ + + + + + Process Name + Software + Version + + + """ + ) + ] + for process, tmp_versions in sorted(versions.items()): + html.append("") + for i, (tool, version) in enumerate(sorted(tmp_versions.items())): + html.append( + dedent( + f"""\\ + + {process if (i == 0) else ''} + {tool} + {version} + + """ + ) + ) + html.append("") + html.append("") + return "\\n".join(html) + + +def main(): + """Load all version files and generate merged output.""" + versions_this_module = {} + versions_this_module["${task.process}"] = { + "python": platform.python_version(), + "yaml": yaml.__version__, + } + + with open("$versions") as f: + versions_by_process = yaml.load(f, Loader=yaml.BaseLoader) | versions_this_module + + # aggregate versions by the module name (derived from fully-qualified process name) + versions_by_module = {} + for process, process_versions in versions_by_process.items(): + module = process.split(":")[-1] + try: + if versions_by_module[module] != process_versions: + raise AssertionError( + "We assume that software versions are the same between all modules. " + "If you see this error-message it means you discovered an edge-case " + "and should open an issue in nf-core/tools. " + ) + except KeyError: + versions_by_module[module] = process_versions + + versions_by_module["Workflow"] = { + "Nextflow": "$workflow.nextflow.version", + "$workflow.manifest.name": "$workflow.manifest.version", + } + + versions_mqc = { + "id": "software_versions", + "section_name": "${workflow.manifest.name} Software Versions", + "section_href": "https://github.com/${workflow.manifest.name}", + "plot_type": "html", + "description": "are collected at run time from the software output.", + "data": _make_versions_html(versions_by_module), + } + + with open("software_versions.yml", "w") as f: + yaml.dump(versions_by_module, f, default_flow_style=False) + with open("software_versions_mqc.yml", "w") as f: + yaml.dump(versions_mqc, f, default_flow_style=False) + + with open("versions.yml", "w") as f: + yaml.dump(versions_this_module, f, default_flow_style=False) + + +if __name__ == "__main__": + main() diff --git a/modules/nf-core/modules/bowtie2/align/functions.nf b/modules/nf-core/modules/bowtie2/align/functions.nf deleted file mode 100644 index da9da093d..000000000 --- a/modules/nf-core/modules/bowtie2/align/functions.nf +++ /dev/null @@ -1,68 +0,0 @@ -// -// Utility functions used in nf-core DSL2 module files -// - -// -// Extract name of software tool from process name using $task.process -// -def getSoftwareName(task_process) { - return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase() -} - -// -// Function to initialise default values and to generate a Groovy Map of available options for nf-core modules -// -def initOptions(Map args) { - def Map options = [:] - options.args = args.args ?: '' - options.args2 = args.args2 ?: '' - options.args3 = args.args3 ?: '' - options.publish_by_meta = args.publish_by_meta ?: [] - options.publish_dir = args.publish_dir ?: '' - options.publish_files = args.publish_files - options.suffix = args.suffix ?: '' - return options -} - -// -// Tidy up and join elements of a list to return a path string -// -def getPathFromList(path_list) { - def paths = path_list.findAll { item -> !item?.trim().isEmpty() } // Remove empty entries - paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes - return paths.join('/') -} - -// -// Function to save/publish module results -// 
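The initOptions/saveFiles/getSoftwareName helpers being deleted here are superseded in DSL2 by task.ext plus a declarative publishDir, typically set once in conf/modules.config. A hedged sketch of that replacement pattern, with illustrative values:

process {
    publishDir = [
        path: { "${params.outdir}/${task.process.tokenize(':')[-1].toLowerCase()}" },
        mode: params.publish_dir_mode,
        saveAs: { filename -> filename.equals('versions.yml') ? null : filename }   // never publish versions.yml
    ]
}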
-def saveFiles(Map args) { - if (!args.filename.endsWith('.version.txt')) { - def ioptions = initOptions(args.options) - def path_list = [ ioptions.publish_dir ?: args.publish_dir ] - if (ioptions.publish_by_meta) { - def key_list = ioptions.publish_by_meta instanceof List ? ioptions.publish_by_meta : args.publish_by_meta - for (key in key_list) { - if (args.meta && key instanceof String) { - def path = key - if (args.meta.containsKey(key)) { - path = args.meta[key] instanceof Boolean ? "${key}_${args.meta[key]}".toString() : args.meta[key] - } - path = path instanceof String ? path : '' - path_list.add(path) - } - } - } - if (ioptions.publish_files instanceof Map) { - for (ext in ioptions.publish_files) { - if (args.filename.endsWith(ext.key)) { - def ext_list = path_list.collect() - ext_list.add(ext.value) - return "${getPathFromList(ext_list)}/$args.filename" - } - } - } else if (ioptions.publish_files == null) { - return "${getPathFromList(path_list)}/$args.filename" - } - } -} diff --git a/modules/nf-core/modules/bowtie2/align/main.nf b/modules/nf-core/modules/bowtie2/align/main.nf deleted file mode 100644 index ad6ed92eb..000000000 --- a/modules/nf-core/modules/bowtie2/align/main.nf +++ /dev/null @@ -1,71 +0,0 @@ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName } from './functions' - -params.options = [:] -options = initOptions(params.options) - -process BOWTIE2_ALIGN { - tag "$meta.id" - label 'process_high' - publishDir "${params.outdir}/${meta.id}", mode: params.publish_dir_mode - - conda (params.enable_conda ? 'bioconda::bowtie2=2.4.2 bioconda::samtools=1.11 conda-forge::pigz=2.3.4' : null) - if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/mulled-v2-ac74a7f02cebcfcc07d8e8d1d750af9c83b4d45a:577a697be67b5ae9b16f637fd723b8263a3898b3-0" - } else { - container "quay.io/biocontainers/mulled-v2-ac74a7f02cebcfcc07d8e8d1d750af9c83b4d45a:577a697be67b5ae9b16f637fd723b8263a3898b3-0" - } - - input: - tuple val(meta), path(reads) - path index - - output: - tuple val(meta), path('*.bam'), emit: bam - tuple val(meta), path('*.log'), emit: log - path '*.version.txt' , emit: version - tuple val(meta), path('*fastq.gz'), optional:true, emit: fastq - - script: - def split_cpus = Math.floor(task.cpus/2) - def software = getSoftwareName(task.process) - def prefix = options.suffix ? "${meta.id}${options.suffix}" : "${meta.id}" - if (meta.single_end) { - def unaligned = params.save_unaligned ? "--un-gz ${prefix}.unmapped.fastq.gz" : '' - """ - INDEX=`find -L ./ -name "*.rev.1.bt2" | sed 's/.rev.1.bt2//'` - bowtie2 \\ - -x \$INDEX \\ - -U $reads \\ - --threads ${split_cpus} \\ - $unaligned \\ - $options.args \\ - 2> ${prefix}.bowtie2.log \\ - | samtools view -@ ${split_cpus} $options.args2 -bhS -o ${prefix}.bam - - - echo \$(bowtie2 --version 2>&1) | sed 's/^.*bowtie2-align-s version //; s/ .*\$//' > ${software}.version.txt - """ - } else { - def unaligned = params.save_unaligned ? 
"--un-conc-gz ${prefix}.unmapped.fastq.gz" : '' - """ - INDEX=`find -L ./ -name "*.rev.1.bt2" | sed 's/.rev.1.bt2//'` - bowtie2 \\ - -x \$INDEX \\ - -1 ${reads[0]} \\ - -2 ${reads[1]} \\ - --threads ${split_cpus} \\ - $unaligned \\ - $options.args \\ - 2> ${prefix}.bowtie2.log \\ - | samtools view -@ ${split_cpus} $options.args2 -bhS -o ${prefix}.bam - - - if [ -f ${prefix}.unmapped.fastq.1.gz ]; then - mv ${prefix}.unmapped.fastq.1.gz ${prefix}.unmapped_1.fastq.gz - fi - if [ -f ${prefix}.unmapped.fastq.2.gz ]; then - mv ${prefix}.unmapped.fastq.2.gz ${prefix}.unmapped_2.fastq.gz - fi - echo \$(bowtie2 --version 2>&1) | sed 's/^.*bowtie2-align-s version //; s/ .*\$//' > ${software}.version.txt - """ - } -} diff --git a/modules/nf-core/modules/bowtie2/align/meta.yml b/modules/nf-core/modules/bowtie2/align/meta.yml deleted file mode 100644 index 9d9cd004b..000000000 --- a/modules/nf-core/modules/bowtie2/align/meta.yml +++ /dev/null @@ -1,50 +0,0 @@ -name: bowtie2_align -description: Align reads to a reference genome using bowtie2 -keywords: - - align - - fasta - - genome - - reference -tools: - - bowtie2: - description: | - Bowtie 2 is an ultrafast and memory-efficient tool for aligning - sequencing reads to long reference sequences. - homepage: http://bowtie-bio.sourceforge.net/bowtie2/index.shtml - documentation: http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml - doi: 10.1038/nmeth.1923 -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - reads: - type: file - description: | - List of input FastQ files of size 1 and 2 for single-end and paired-end data, - respectively. - - index: - type: file - description: Bowtie2 genome index files - pattern: "*.ebwt" -output: - - bam: - type: file - description: Output BAM file containing read alignments - pattern: "*.{bam}" - - version: - type: file - description: File containing software version - pattern: "*.{version.txt}" - - fastq: - type: file - description: Unaligned FastQ files - pattern: "*.fastq.gz" - - log: - type: file - description: Aligment log - pattern: "*.log" -authors: - - "@joseespinosa" - - "@drpatelh" diff --git a/modules/nf-core/modules/prodigal/functions.nf b/modules/nf-core/modules/prodigal/functions.nf deleted file mode 100644 index da9da093d..000000000 --- a/modules/nf-core/modules/prodigal/functions.nf +++ /dev/null @@ -1,68 +0,0 @@ -// -// Utility functions used in nf-core DSL2 module files -// - -// -// Extract name of software tool from process name using $task.process -// -def getSoftwareName(task_process) { - return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase() -} - -// -// Function to initialise default values and to generate a Groovy Map of available options for nf-core modules -// -def initOptions(Map args) { - def Map options = [:] - options.args = args.args ?: '' - options.args2 = args.args2 ?: '' - options.args3 = args.args3 ?: '' - options.publish_by_meta = args.publish_by_meta ?: [] - options.publish_dir = args.publish_dir ?: '' - options.publish_files = args.publish_files - options.suffix = args.suffix ?: '' - return options -} - -// -// Tidy up and join elements of a list to return a path string -// -def getPathFromList(path_list) { - def paths = path_list.findAll { item -> !item?.trim().isEmpty() } // Remove empty entries - paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes - return paths.join('/') -} - -// -// Function to save/publish module results -// 
-def saveFiles(Map args) { - if (!args.filename.endsWith('.version.txt')) { - def ioptions = initOptions(args.options) - def path_list = [ ioptions.publish_dir ?: args.publish_dir ] - if (ioptions.publish_by_meta) { - def key_list = ioptions.publish_by_meta instanceof List ? ioptions.publish_by_meta : args.publish_by_meta - for (key in key_list) { - if (args.meta && key instanceof String) { - def path = key - if (args.meta.containsKey(key)) { - path = args.meta[key] instanceof Boolean ? "${key}_${args.meta[key]}".toString() : args.meta[key] - } - path = path instanceof String ? path : '' - path_list.add(path) - } - } - } - if (ioptions.publish_files instanceof Map) { - for (ext in ioptions.publish_files) { - if (args.filename.endsWith(ext.key)) { - def ext_list = path_list.collect() - ext_list.add(ext.value) - return "${getPathFromList(ext_list)}/$args.filename" - } - } - } else if (ioptions.publish_files == null) { - return "${getPathFromList(path_list)}/$args.filename" - } - } -} diff --git a/modules/nf-core/modules/prodigal/main.nf b/modules/nf-core/modules/prodigal/main.nf deleted file mode 100644 index 36a13c5d1..000000000 --- a/modules/nf-core/modules/prodigal/main.nf +++ /dev/null @@ -1,44 +0,0 @@ -// Import generic module functions -include { initOptions; saveFiles; getSoftwareName } from './functions' - -params.options = [:] -options = initOptions(params.options) - -process PRODIGAL { - tag "Annotating $meta.id" - label 'process_low' - publishDir "${params.outdir}/${meta.id}", mode: params.publish_dir_mode - - - conda (params.enable_conda ? "bioconda::prodigal=2.6.3" : null) - if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { - container "https://depot.galaxyproject.org/singularity/prodigal:2.6.3--h516909a_2" - } else { - container "quay.io/biocontainers/prodigal:2.6.3--h516909a_2" - } - - input: - tuple val(meta), path(genome) - val(output_format) - - output: - tuple val(meta), path("orfs.${output_format}"), emit: gene_annotations - tuple val(meta), path("orfs.fna"), emit: nucleotide_fasta - tuple val(meta), path("orfs.faa"), emit: amino_acid_fasta - tuple val(meta), path("orfs_all.txt"), emit: all_gene_annotations - path "*.version.txt" , emit: version - - script: - def software = getSoftwareName(task.process) - """ - prodigal -i ${genome} \\ - $options.args \\ - -f $output_format \\ - -d "orfs.fna" \\ - -o "orfs.${output_format}" \\ - -a "orfs.faa" \\ - -s "orfs_all.txt" - - echo \$(prodigal -v 2>&1) | sed -n 's/Prodigal V\\(.*\\):.*/\\1/p' > ${software}.version.txt - """ -} diff --git a/modules/nf-core/modules/prodigal/meta.yml b/modules/nf-core/modules/prodigal/meta.yml deleted file mode 100644 index f20d878e0..000000000 --- a/modules/nf-core/modules/prodigal/meta.yml +++ /dev/null @@ -1,41 +0,0 @@ -name: prodigal -description: write your description here -keywords: - - sort -tools: - - prodigal: - description: Prodigal (Prokaryotic Dynamic Programming Genefinding Algorithm) is a microbial (bacterial and archaeal) gene finding program - homepage: {} - documentation: {} - tool_dev_url: {} - doi: "" - licence: ["GPL v3"] - -input: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - bam: - type: file - description: BAM/CRAM/SAM file - pattern: "*.{bam,cram,sam}" - -output: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test', single_end:false ] - - version: - type: file - description: File containing software version - pattern: "*.{version.txt}" - - bam: - type: file - description: Sorted BAM/CRAM/SAM file - pattern: "*.{bam,cram,sam}" - -authors: - - "@grst" diff --git a/modules/nf-core/prodigal/main.nf b/modules/nf-core/prodigal/main.nf new file mode 100644 index 000000000..8a2fe478b --- /dev/null +++ b/modules/nf-core/prodigal/main.nf @@ -0,0 +1,43 @@ +process PRODIGAL { + tag "$meta.id" + label 'process_single' + + conda "bioconda::autometa" + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0" + } else { + container "jasonkwan/autometa:${params.autometa_image_tag}" + } + + input: + tuple val(meta), path(genome) + val(output_format) + + output: + tuple val(meta), path("${prefix}.${output_format}"), emit: gene_annotations + tuple val(meta), path("${prefix}.fna"), emit: nucleotide_fasta + tuple val(meta), path("${prefix}.faa"), emit: amino_acid_fasta + tuple val(meta), path("${prefix}_all.txt"), emit: all_gene_annotations + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + """ + gzip -cdf ${genome} | prodigal \\ + $args \\ + -f $output_format \\ + -d "${prefix}.fna" \\ + -o "${prefix}.${output_format}" \\ + -a "${prefix}.faa" \\ + -s "${prefix}_all.txt" + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + prodigal: \$(prodigal -v 2>&1 | sed -n 's/Prodigal V\\(.*\\):.*/\\1/p') + END_VERSIONS + """ +} diff --git a/modules/nf-core/prodigal/meta.yml b/modules/nf-core/prodigal/meta.yml new file mode 100644 index 000000000..8cb3d12eb --- /dev/null +++ b/modules/nf-core/prodigal/meta.yml @@ -0,0 +1,55 @@ +name: prodigal +description: Prodigal (Prokaryotic Dynamic Programming Genefinding Algorithm) is a microbial (bacterial and archaeal) gene finding program +keywords: + - sort +tools: + - prodigal: + description: Prodigal (Prokaryotic Dynamic Programming Genefinding Algorithm) is a microbial (bacterial and archaeal) gene finding program + homepage: https://github.com/hyattpd/Prodigal + documentation: https://github.com/hyattpd/prodigal/wiki + tool_dev_url: https://github.com/hyattpd/Prodigal + doi: "10.1186/1471-2105-11-119" + licence: ["GPL v3"] + +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - genome: + type: file + description: fasta/fasta.gz file + - output_format: + type: string + description: Output format ("gbk"/"gff"/"sqn"/"sco") + +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
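The new PRODIGAL module decompresses its input on the fly (gzip -cdf) and takes the output format as a value input. A sketch of a call site; the include path and file names are hypothetical, and the format choices come from the module's meta.yml.

include { PRODIGAL } from '../modules/nf-core/prodigal/main'

workflow {
    genome_ch = Channel.of( [ [ id:'mock_data' ], file('metagenome.fna.gz') ] )

    PRODIGAL( genome_ch, 'gbk' )                    // output_format: one of gbk/gff/sqn/sco
    PRODIGAL.out.amino_acid_fasta.view()            // [ meta, <prefix>.faa ], used downstream for ORF/marker steps
}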
[ id:'test', single_end:false ] + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - nucleotide_fasta: + type: file + description: nucleotide sequences file + pattern: "*.{fna}" + - amino_acid_fasta: + type: file + description: protein translations file + pattern: "*.{faa}" + - all_gene_annotations: + type: file + description: complete starts file + pattern: "*.{_all.txt}" + - gene_annotations: + type: file + description: gene annotations in output_format given as input + pattern: "*.{output_format}" + +authors: + - "@grst" diff --git a/nextflow.config b/nextflow.config index 9d9e7bad5..fca8f4d8f 100644 --- a/nextflow.config +++ b/nextflow.config @@ -4,8 +4,7 @@ // ***************** manifest { - name = "autometa" - author = "Jason C. Kwan Lab" + name = "KwanLab/autometa" homePage = "https://github.com/KwanLab/Autometa" defaultBranch = "main" description = "Autometa: Automated Extraction of Microbial Genomes from Shotgun Metagenomes" @@ -46,13 +45,17 @@ params { */ taxonomy_aware = false - single_db_dir = null - nr_dmnd_dir = null - prot_accession2taxid_gz_dir = null - taxdump_tar_gz_dir = null + single_db_dir = "autometa_database_directory" + nr_dmnd_dir = "${params.single_db_dir}" + lca_dir = "${params.single_db_dir}" + prot_accession2taxid_gz_dir = "${params.single_db_dir}" + taxdump_tar_gz_dir = "${params.single_db_dir}" large_downloads_permission = false binning_starting_rank = "superkingdom" // choices: "superkingdom", "phylum", "class", "order", "family", "genus", "species" + gtdb_version = "220" + gtdb_dir = "${params.single_db_dir}/gtdb" + use_gtdb = false /* * ------------------------------------------------- * Binning Parameters @@ -70,9 +73,9 @@ params { classification_method = "decision_tree" classification_kmer_pca_dimensions = 50 completeness = 20.0 - purity = 90.0 - gc_stddev_limit = 25.0 - cov_stddev_limit = 5.0 + purity = 95.0 + gc_stddev_limit = 5.0 + cov_stddev_limit = 25.0 unclustered_recruitment = false /* * ------------------------------------------------- @@ -89,13 +92,11 @@ params { validate_params = true show_hidden_params = null schema_ignore_params = 'genomes,modules' - enable_conda = false singularity_pull_docker_container = null // Config options custom_config_version = 'master' custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" - hostnames = [:] config_profile_description = null config_profile_contact = null config_profile_url = null @@ -103,12 +104,14 @@ params { // Max resource options // Defaults only, expecting to be overwritten - max_memory = '16.GB' - max_cpus = 4 - max_time = '240.h' - + max_memory = '200.GB' + max_cpus = 12 + max_time = '48.h' } +trace.overwrite = true +dag.overwrite = true + params.tracedir = "${params.outdir}/trace" @@ -131,7 +134,6 @@ profiles { standard { process.executor = "local" docker.enabled = true - docker.userEmulation = true singularity.enabled = false podman.enabled = false shifter.enabled = false @@ -141,31 +143,31 @@ profiles { process.executor = "slurm" // NOTE: You can determine your slurm partition (e.g. 
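With the database sub-directories now defaulting to ${params.single_db_dir}, pointing a run at an existing database location and raising the resource caps can be done from a small user config passed with -c. A hedged sketch; the paths and values are illustrative only.

// custom.config (hypothetical), supplied as: nextflow run ... -c custom.config
params {
    single_db_dir = '/data/autometa_databases'   // nr_dmnd_dir, lca_dir, prot_accession2taxid_gz_dir,
                                                 // taxdump_tar_gz_dir and gtdb_dir default to this in nextflow.config
    max_cpus      = 32
    max_memory    = '128.GB'
    max_time      = '72.h'
}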
process.queue) with the `sinfo` command process.queue = "queue" - docker.enabled = true - docker.userEmulation = true - singularity.enabled = false - podman.enabled = false - shifter.enabled = false - charliecloud.enabled = false executor { queueSize = 8 } } conda { - params.enable_conda = true + conda.enabled = true docker.enabled = false singularity.enabled = false podman.enabled = false shifter.enabled = false charliecloud.enabled = false + createTimeout = '1 h' } docker { - docker.enabled = true - docker.userEmulation = true - singularity.enabled = false - podman.enabled = false - shifter.enabled = false - charliecloud.enabled = false + docker.enabled = true + conda.enabled = false + singularity.enabled = false + podman.enabled = false + shifter.enabled = false + charliecloud.enabled = false + apptainer.enabled = false + docker.runOptions = '-u $(id -u):$(id -g)' + } + arm { + docker.runOptions = '-u $(id -u):$(id -g) --platform=linux/amd64' } singularity { singularity.enabled = true @@ -198,6 +200,11 @@ profiles { } test { includeConfig 'conf/test.config' } test_full { includeConfig 'conf/test_full.config' } + + apptainer.registry = 'quay.io' + docker.registry = 'registry.hub.docker.com' + podman.registry = 'quay.io' + singularity.registry = 'quay.io' } // Export these variables to prevent local Python/R libraries from conflicting with those in the container @@ -225,7 +232,7 @@ trace { } dag { enabled = true - file = "${params.outdir}/trace/pipeline_dag_${trace_timestamp}.svg" + file = "${params.outdir}/trace/pipeline_dag_${trace_timestamp}.html" } diff --git a/nextflow_schema.json b/nextflow_schema.json index 0b4f2b2dd..8e38d3062 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -10,17 +10,13 @@ "type": "object", "fa_icon": "fas fa-terminal", "description": "Define where the pipeline should find input data and save output data.", - "required": [ - "input", - "publish_dir_mode" - ], + "required": ["input", "publish_dir_mode"], "properties": { "input": { "type": "string", "fa_icon": "fas fa-file-csv", "description": "Path to input comma-delimited sample sheet(s).", - "help_text": "Use this to specify your inputs' names, metagenomes, reads, coverage table and whether to calculate coverage from the assembly headers or from read alignments.\n\n\nSee https://autometa.readthedocs.io/en/latest/nextflow-workflow.html#sample-sheet-preparation for information on preparing an input sample sheet.", - "default": "" + "help_text": "Use this to specify your inputs' names, metagenomes, reads, coverage table and whether to calculate coverage from the assembly headers or from read alignments.\n\n\nSee https://autometa.readthedocs.io/en/latest/nextflow-workflow.html#sample-sheet-preparation for information on preparing an input sample sheet." 
}, "outdir": { "type": "string", @@ -110,25 +106,25 @@ }, "completeness": { "type": "number", - "default": 20, + "default": 20.0, "fa_icon": "fas fa-cogs", "description": "Minimum completeness needed to keep a cluster (default is at least 20% complete)" }, "purity": { "type": "number", - "default": 95, + "default": 95.0, "fa_icon": "fas fa-cogs", "description": "Minimum purity needed to keep a cluster (default is at least 95% pure)" }, "gc_stddev_limit": { "type": "number", - "default": 5, + "default": 5.0, "fa_icon": "fas fa-cogs", "description": "Maximum GC% standard deviation under which a cluster is kept (default is 5%)" }, "cov_stddev_limit": { "type": "number", - "default": 25, + "default": 25.0, "fa_icon": "fas fa-cogs", "description": "Maximum coverage standard deviation under which a cluster is kept (default is 25%)" }, @@ -183,14 +179,14 @@ "type": "string", "fa_icon": "fas fa-folder-open", "description": "Currently not used do not set", - "default": "Currently not used do not set", + "default": "autometa_database_directory", "hidden": true }, "taxdump_tar_gz_dir": { "type": "string", "fa_icon": "fas fa-folder-open", "description": "Currently not used do not set", - "default": "Currently not used do not set", + "default": "autometa_database_directory", "hidden": true }, "binning_starting_rank": { @@ -222,27 +218,22 @@ "properties": { "max_cpus": { "type": "integer", - "default": 4, + "default": 12, "fa_icon": "fas fa-microchip", "description": "Max cpus to use/request" }, "max_memory": { "type": "string", - "default": "16 GB", + "default": "200.GB", "fa_icon": "fas fa-memory", "description": "Max RAM to use/request" }, "max_time": { "type": "string", - "default": "240.h", + "default": "48.h", "fa_icon": "fas fa-clock", "description": "Max time a *single* process is allowed to run" }, - "enable_conda": { - "type": "boolean", - "fa_icon": "fas fa-snake", - "description": "Use conda?" 
- }, "use_run_name": { "type": "boolean", "hidden": true, @@ -269,11 +260,7 @@ "help_text": "Appends input to `jasonkwan/autometa`\n\njasonkwan/autometa:${params.autometa_image_tag}\"" } }, - "required": [ - "max_cpus", - "max_memory", - "max_time" - ] + "required": ["max_cpus", "max_memory", "max_time"] }, "generic_nf_core_options": { "title": "Generic nf-core options", @@ -350,11 +337,6 @@ "default": "https://raw.githubusercontent.com/nf-core/configs/master", "hidden": true }, - "hostnames": { - "type": "string", - "default": "[binac:['.binac.uni-tuebingen.de'], cbe:['.cbe.vbc.ac.at'], cfc:['.hpc.uni-tuebingen.de'], crick:['.thecrick.org'], icr_davros:['.davros.compute.estate'], imperial:['.hpc.ic.ac.uk'], imperial_mb:['.hpc.ic.ac.uk'], genotoul:['.genologin1.toulouse.inra.fr', '.genologin2.toulouse.inra.fr'], genouest:['.genouest.org'], uppmax:['.uppmax.uu.se'], utd_ganymede:['ganymede.utdallas.edu'], utd_sysbio:['sysbio.utdallas.edu']]", - "hidden": true - }, "show_hidden_params": { "type": "string", "hidden": true @@ -364,9 +346,7 @@ "hidden": true } }, - "required": [ - "validate_params" - ] + "required": ["validate_params"] } }, "allOf": [ @@ -385,5 +365,26 @@ { "$ref": "#/definitions/generic_nf_core_options" } - ] + ], + "properties": { + "lca_dir": { + "type": "string", + "default": "autometa_database_directory" + }, + "gtdb_version": { + "type": "integer", + "default": 220 + }, + "gtdb_dir": { + "type": "string", + "default": "autometa_database_directory/gtdb" + }, + "use_gtdb": { + "type": "boolean" + }, + "schema_ignore_params": { + "type": "string", + "default": "genomes,modules" + } + } } diff --git a/setup.py b/setup.py index bd79d9947..fbe7337d1 100644 --- a/setup.py +++ b/setup.py @@ -43,6 +43,7 @@ def read(fname): "autometa-download-dataset = autometa.validation.datasets:main", "autometa-cami-format = autometa.validation.cami:main", "autometa-benchmark = autometa.validation.benchmark:main", + "autometa-download-gtdb = autometa.taxonomy.download_gtdb_files:main", "autometa = autometa.__main__:main", ] }, diff --git a/subworkflows/local/binning.nf b/subworkflows/local/binning.nf index ddfd8aedf..2f9d9f918 100644 --- a/subworkflows/local/binning.nf +++ b/subworkflows/local/binning.nf @@ -1,73 +1,150 @@ -params.binning_options = [:] -params.binning_summary_options = [:] -params.taxdump_tar_gz_dir = [:] +include { BINNING } from '../../modules/local/binning' +include { RECRUIT } from '../../modules/local/unclustered_recruitment' +include { BINNING_SUMMARY } from '../../modules/local/binning_summary' -include { BIN_CONTIGS } from './../../modules/local/bin_contigs.nf' addParams( options: params.binning_options ) -include { BINNING_SUMMARY } from './../../modules/local/binning_summary.nf' addParams( options: params.binning_summary_options, taxdump_tar_gz_dir: params.taxdump_tar_gz_dir ) -workflow BINNING { - +workflow BIN { take: - metagenome - kmers_embedded - coverage - gc_content - markers - taxon_assignments - binning_column + filtered_metagenome_fasta + filtered_metagenome_gc_content + markers_ch + coverage_ch + taxonomy_results_ch + taxonomically_split_fna_ch + taxdump_files + dbtype main: - kmers_embedded - .join( - coverage - ) - .join( - gc_content - ) - .join( - markers - ) - .set{metagenome_annotations} - - if (params.taxonomy_aware) { - metagenome_annotations - .join( - taxon_assignments - ) - .set{binning_ch} - } else { - metagenome_annotations - .combine( - taxon_assignments - ) - .set{binning_ch} - } + ch_versions = Channel.empty() + + // has taxon: + // 
taxonomically_split_fna_ch + // markers_ch + // not has taxon: + // coverage_ch + // filtered_metagenome_gc_content + // taxonomy_results_ch + + // Transform taxonomic-specific channels (keep taxon info) + taxonomically_split_fna_ch = taxonomically_split_fna_ch + .map { meta, files -> + def key = [id: meta.id, taxon: meta.taxon] + [key, files] + } + + markers_ch = markers_ch + .map { meta, files -> + def key = [id: meta.id, taxon: meta.taxon] + [key, files] + } + + // Transform per-sample channels + coverage_ch = coverage_ch + .map { meta, files -> + [meta.id, files] + } + + filtered_metagenome_gc_content = filtered_metagenome_gc_content + .map { meta, files -> + [meta.id, files] + } + + taxonomy_results_ch = taxonomy_results_ch + .map { meta, files -> + [meta.id, files] + } + + // Create branched workflow + workflow_branch = taxonomically_split_fna_ch + .join(markers_ch) + .map { key, kmers, markers -> + // Use the full sample ID as the join key while preserving taxon info + [key.id, [id: key.id, taxon: key.taxon], kmers, markers] + } + .combine(coverage_ch, by: 0) + .combine(filtered_metagenome_gc_content, by: 0) + .combine(taxonomy_results_ch, by: 0) + .map { id, meta, kmers, markers, coverage, gc_content, taxonomy_results -> + // Final structure: [meta with taxon, files...] + [meta, kmers, markers, coverage, gc_content, taxonomy_results] + } + + // Set the output channel + workflow_branch.set { to_bin_ch } - BIN_CONTIGS ( - binning_ch + BINNING( + to_bin_ch ) - BIN_CONTIGS.out.main - .join( - markers - ).join( - metagenome + ch_versions = ch_versions.mix(BINNING.out.versions) + + if (params.unclustered_recruitment) { + // Prepare inputs for recruitment channel + + to_bin_ch + .join(BINNING.out.main) + .set{recruitment_ch} + + RECRUIT( + recruitment_ch ) - .set{binning_summary_ch} + ch_versions = ch_versions.mix(RECRUIT.out.versions) + + RECRUIT.out.main + .set{binning_results_ch} + binning_col = Channel.from("recruited_cluster") + } else { + binning_results_ch = BINNING.out.main + binning_col = Channel.from("cluster") + } + + + // Set inputs for binning summary + binning_results_ch + .map { meta, files -> [meta.subMap(['id']), meta, files] } + .join(markers_ch.map { meta, files -> [meta.subMap(['id']), files] }) + .join(filtered_metagenome_fasta.map { meta, files -> [meta.subMap(['id']), files] }) + .map { it.drop(1) } + .set{binning_summary_input_ch} + + if (params.debug) { + binning_results_ch.view { meta -> + println "binning_results_ch: ${meta}" + } + markers_ch.view { meta -> + println "markers_ch: ${meta}" + } + filtered_metagenome_fasta.view { meta -> + println "filtered_metagenome_fasta: ${meta}" + } + taxdump_files.view { meta -> + println "taxdump_files: ${meta}" + } + markers_ch.view { meta -> + println "markers_ch: ${meta}" + } + binning_col.view { meta -> + println "binning_col: ${meta}" + } + binning_summary_input_ch.view { meta -> + println "binning_summary_input_ch: ${meta}" + } + } - ncbi_tax_dir = file(params.taxdump_tar_gz_dir) + binning_summary_input_ch + .combine(taxdump_files.toList()) + .combine(dbtype) + .combine(binning_col) + .set{binning_summary_input_ch2} - BINNING_SUMMARY ( - binning_summary_ch, - binning_column, - ncbi_tax_dir + BINNING_SUMMARY( + binning_summary_input_ch2 ) + ch_versions = ch_versions.mix(BINNING_SUMMARY.out.versions) emit: - binning = BIN_CONTIGS.out.binning - binning_main = BIN_CONTIGS.out.main - summary_stats = BINNING_SUMMARY.out.stats - summary_taxa = BINNING_SUMMARY.out.taxonomies - metabins = BINNING_SUMMARY.out.metabins + 
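The wrangling above keys the taxon-split channels on both id and taxon while per-sample channels are keyed on id alone, then joins everything back onto one meta map. The same idiom in a tiny self-contained sketch, using toy values rather than the pipeline's channels:

workflow {
    per_taxon  = Channel.of( [ [ id:'s1', taxon:'bacteria' ], 'bacteria.kmers.tsv' ],
                             [ [ id:'s1', taxon:'archaea'  ], 'archaea.kmers.tsv'  ] )
    per_sample = Channel.of( [ [ id:'s1' ], 'coverage.tsv' ] )

    per_taxon
        .map { meta, kmers -> [ meta.id, meta, kmers ] }                       // expose the sample id as the join key
        .combine( per_sample.map { meta, cov -> [ meta.id, cov ] }, by: 0 )
        .map { id, meta, kmers, cov -> [ meta, kmers, cov ] }                  // drop the bare id, keep the full meta
        .view()
}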
binning_results = binning_results_ch + // binning_summary = BINNING_SUMMARY + versions = ch_versions } diff --git a/subworkflows/local/contig_coverage.nf b/subworkflows/local/calculate_coverage.nf similarity index 74% rename from subworkflows/local/contig_coverage.nf rename to subworkflows/local/calculate_coverage.nf index f52c7db4f..5769533bf 100644 --- a/subworkflows/local/contig_coverage.nf +++ b/subworkflows/local/calculate_coverage.nf @@ -1,39 +1,46 @@ +//TODO: These don't map to anything params.fwd_reads = null params.rev_reads = null params.se_reads = null -params.align_reads_options = [:] -params.samtools_viewsort_options = [:] -params.bedtools_genomecov_options = [:] +include { ALIGN_READS } from '../../modules/local/align_reads' +include { SAMTOOLS_VIEW_AND_SORT } from '../../modules/local/samtools_view_sort' +include { BEDTOOLS_GENOMECOV } from '../../modules/local/bedtools_genomecov' +include { PARSE_BED } from '../../modules/local/parse_bed' -include { ALIGN_READS } from '../../modules/local/align_reads' addParams( options: params.align_reads_options ) -include { SAMTOOLS_VIEW_AND_SORT } from '../../modules/local/samtools_view_sort' addParams( samtools_viewsort_options: params.samtools_viewsort_options ) -include { BEDTOOLS_GENOMECOV } from '../../modules/local/bedtools_genomecov' addParams( options: params.bedtools_genomecov_options ) -include { PARSE_BED } from '../../modules/local/parse_bed' addParams( ) - - -workflow CONTIG_COVERAGE { +workflow CALCULATE_COVERAGE { take: metagenome_reads_ch main: + ch_versions = Channel.empty() + ALIGN_READS( metagenome_reads_ch ) + ch_versions = ch_versions.mix(ALIGN_READS.out.versions) + SAMTOOLS_VIEW_AND_SORT( ALIGN_READS.out.sam ) + ch_versions = ch_versions.mix(SAMTOOLS_VIEW_AND_SORT.out.versions) + BEDTOOLS_GENOMECOV( SAMTOOLS_VIEW_AND_SORT.out.bam ) + ch_versions = ch_versions.mix(BEDTOOLS_GENOMECOV.out.versions) + PARSE_BED(BEDTOOLS_GENOMECOV.out.bed) + ch_versions = ch_versions.mix(PARSE_BED.out.versions) emit: sam = ALIGN_READS.out.sam bam = SAMTOOLS_VIEW_AND_SORT.out.bam bed = BEDTOOLS_GENOMECOV.out.bed coverage = PARSE_BED.out.coverage + versions = ch_versions + } /* diff --git a/subworkflows/local/coverage.nf b/subworkflows/local/coverage.nf new file mode 100644 index 000000000..b7528dad7 --- /dev/null +++ b/subworkflows/local/coverage.nf @@ -0,0 +1,35 @@ +include { CALCULATE_COVERAGE } from './calculate_coverage' +include { SPADES_KMER_COVERAGE } from '../../modules/local/spades_kmer_coverage' + +workflow COVERAGE { + take: + filtered_metagenome_fasta + filtered_metagenome_fasta_and_reads + user_provided_coverage_table + + main: + // meta.cov_from_assembly.equals('0') + + ch_versions = Channel.empty() + + CALCULATE_COVERAGE(filtered_metagenome_fasta_and_reads) + ch_versions = ch_versions.mix(CALCULATE_COVERAGE.out.versions) + + SPADES_KMER_COVERAGE ( + filtered_metagenome_fasta, + ) + ch_versions = ch_versions.mix(SPADES_KMER_COVERAGE.out.versions) + + // https://nextflow-io.github.io/patterns/conditional-process/ + // basically "use input-table coverage, extracted spades coverage, or calculated coverage" + // TODO: this seems + user_provided_coverage_table + .mix(CALCULATE_COVERAGE.out.coverage) + .mix(SPADES_KMER_COVERAGE.out.coverage) + .set{coverage_ch} + + + emit: + coverage_ch = coverage_ch + versions = ch_versions +} diff --git a/subworkflows/local/functions.nf b/subworkflows/local/functions.nf deleted file mode 100644 index 4492f839c..000000000 --- a/subworkflows/local/functions.nf +++ /dev/null @@ -1,99 +0,0 @@ -/* 
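COVERAGE mixes three mutually exclusive coverage sources and relies on upstream routing so that each sample reaches only one of them (the meta.cov_from_assembly comment above hints at this). A hedged sketch of such routing with branch; the keys and values are assumptions based on that hint, not code from this PR.

workflow {
    samples = Channel.of( [ [ id:'s1', cov_from_assembly:'0'      ], file('s1.fna') ],
                          [ [ id:'s2', cov_from_assembly:'spades' ], file('s2.fna') ] )

    samples
        .branch { meta, fasta ->
            from_reads:  meta.cov_from_assembly == '0'        // compute coverage from read alignments
            from_spades: meta.cov_from_assembly == 'spades'   // parse coverage from SPAdes contig headers
        }
        .set { routed }

    routed.from_reads.view  { meta, fasta -> "align reads for ${meta.id}" }
    routed.from_spades.view { meta, fasta -> "parse headers for ${meta.id}" }
}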
-MIT License - -Copyright (c) 2018 nf-core - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. -*/ - -// -// Utility functions used in nf-core DSL2 module files -// - -// -// Extract name of software tool from process name using $task.process -// -def getSoftwareName(task_process) { - return task_process.tokenize(':')[-1].tokenize('_')[0].toLowerCase() -} - -// -// Function to initialise default values and to generate a Groovy Map of available options for nf-core modules -// -def initOptions(Map args) { - def Map options = [:] - options.args = args.args ?: '' - options.args2 = args.args2 ?: '' - options.args3 = args.args3 ?: '' - options.publish_by_meta = args.publish_by_meta ?: [] - options.publish_dir = args.publish_dir ?: '' - options.publish_files = args.publish_files - options.suffix = args.suffix ?: '' - return options -} - -// -// Tidy up and join elements of a list to return a path string -// -def getPathFromList(path_list) { - def paths = path_list.findAll { item -> !item?.trim().isEmpty() } // Remove empty entries - paths = paths.collect { it.trim().replaceAll("^[/]+|[/]+\$", "") } // Trim whitespace and trailing slashes - return paths.join('/') -} - -// -// Function to save/publish module results -// -def saveFiles(Map args) { - if (!args.filename.endsWith('.version.txt')) { - def ioptions = initOptions(args.options) - def path_list = [ ioptions.publish_dir ?: args.publish_dir ] - if (ioptions.publish_by_meta) { - def key_list = ioptions.publish_by_meta instanceof List ? ioptions.publish_by_meta : args.publish_by_meta - for (key in key_list) { - if (args.meta && key instanceof String) { - def path = key - if (args.meta.containsKey(key)) { - path = args.meta[key] instanceof Boolean ? "${key}_${args.meta[key]}".toString() : args.meta[key] - } - path = path instanceof String ? 
path : '' - path_list.add(path) - } - } - } - if (ioptions.publish_files instanceof Map) { - for (ext in ioptions.publish_files) { - if (args.filename.endsWith(ext.key)) { - def ext_list = path_list.collect() - ext_list.add(ext.value) - return "${getPathFromList(ext_list)}/$args.filename" - } - } - } else if (ioptions.publish_files == null) { - return "${getPathFromList(path_list)}/$args.filename" - } - } -} - -/* - * Check file extension - */ -def hasExtension(it, extension) { - it.toString().toLowerCase().endsWith(extension.toLowerCase()) -} diff --git a/subworkflows/local/genome_coverage.nf b/subworkflows/local/genome_coverage.nf deleted file mode 100644 index 4843e04b0..000000000 --- a/subworkflows/local/genome_coverage.nf +++ /dev/null @@ -1,29 +0,0 @@ -params.bedtools_genomecov_options = [:] - -include { BEDTOOLS_GENOMECOV } from './../../modules/nf-core/modules/bedtools/genomecov.nf' addParams( options: params.bedtools_genomecov_options ) - -workflow GENOME_COVERAGE { - take: - bam // channel: [ val(meta), path(bam) ] - lengths // channel: [ val(meta), path(lengths) ] // https://bedtools.readthedocs.io/en/latest/content/general-usage.html#genome-file-format - - main: - bedtools_input_ch = bam.combine(lengths) - - BEDTOOLS_GENOMECOV ( - bedtools_input_ch - ) - - bam.out.bed - .combine(lengths) - .combine(BEDTOOLS_GENOMECOV.out.bed) - .set{parse_bed_input_ch} - - PARSE_BED ( - parse_bed_input_ch - ) - - emit: - bed = BEDTOOLS_GENOMECOV.out.bed - coverage = PARSE_BED.out.coverage -} diff --git a/subworkflows/local/input_check.nf b/subworkflows/local/input_check.nf index 7846ddd57..3c3e5dfbc 100644 --- a/subworkflows/local/input_check.nf +++ b/subworkflows/local/input_check.nf @@ -4,37 +4,41 @@ nextflow.enable.dsl=2 -params.options = [:] -include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' addParams( options: params.options ) + +include { SAMPLESHEET_CHECK } from '../../modules/local/samplesheet_check' workflow INPUT_CHECK { take: samplesheet // file: /path/to/samplesheet.csv main: + ch_versions = Channel.empty() + SAMPLESHEET_CHECK ( samplesheet ) + ch_versions = ch_versions.mix(SAMPLESHEET_CHECK.out.versions) // reads channel - SAMPLESHEET_CHECK.out + SAMPLESHEET_CHECK.out.csv .splitCsv ( header:true, sep:',' ) .map { create_fastq_channel(it) } .set { reads } // metagenome channel - SAMPLESHEET_CHECK.out + SAMPLESHEET_CHECK.out.csv .splitCsv ( header:true, sep:',' ) .map { create_metagenome_channel(it) } .set { metagenome } // coverage channel - SAMPLESHEET_CHECK.out + SAMPLESHEET_CHECK.out.csv .splitCsv ( header:true, sep:',' ) .map { create_coverage_channel(it) } .set { coverage } emit: - reads // channel: [ val(meta), [ reads ] ] - metagenome // channel: [ val(meta), [ assembly ]] - coverage // channel: [ val(meta), [ coverage ]] + reads = reads // channel: [ val(meta), [ reads ] ] + metagenome = metagenome // channel: [ val(meta), [ assembly ]] + coverage = coverage // channel: [ val(meta), [ coverage ]] + versions = ch_versions } diff --git a/subworkflows/local/kmers.nf b/subworkflows/local/kmers.nf index b9f9b057c..ca11dc1c6 100644 --- a/subworkflows/local/kmers.nf +++ b/subworkflows/local/kmers.nf @@ -1,19 +1,29 @@ -include { COUNT_KMERS as COUNT } from '../../modules/local/count_kmers' addParams( options: params.count_kmers_options ) -include { NORMALIZE_KMERS as NORMALIZE } from '../../modules/local/normalize_kmers' addParams( options: params.normalize_kmers_options ) -include { EMBED_KMERS as EMBED } from '../../modules/local/embed_kmers' addParams( 
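INPUT_CHECK parses the validated sample sheet three times with splitCsv plus a row-mapping function; the create_* helpers are presumably defined elsewhere in input_check.nf and are not shown in this diff. The idiom in isolation, with illustrative column names:

workflow {
    Channel
        .fromPath( 'samplesheet.csv' )
        .splitCsv( header:true, sep:',' )
        .map { row ->
            def meta = [ id: row.sample, cov_from_assembly: row.cov_from_assembly ]
            [ meta, file(row.assembly) ]     // shape of the "metagenome" channel: [ meta, assembly ]
        }
        .view()
}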
options: params.embed_kmers_options ) +include { COUNT_KMERS as COUNT } from '../../modules/local/count_kmers' +include { NORMALIZE_KMERS as NORMALIZE } from '../../modules/local/normalize_kmers' +include { EMBED_KMERS as EMBED } from '../../modules/local/embed_kmers' workflow KMERS { take: fasta main: + ch_versions = Channel.empty() + COUNT(fasta) + ch_versions = ch_versions.mix(COUNT.out.versions) + NORMALIZE(COUNT.out.counts) + ch_versions = ch_versions.mix(NORMALIZE.out.versions) + EMBED(NORMALIZE.out.normalized) + ch_versions = ch_versions.mix(EMBED.out.versions) + emit: counts = COUNT.out.counts normalized = NORMALIZE.out.normalized embedded = EMBED.out.embedded + versions = ch_versions + } /* diff --git a/subworkflows/local/lca.nf b/subworkflows/local/lca.nf index f5ced39f0..d697a9ce2 100644 --- a/subworkflows/local/lca.nf +++ b/subworkflows/local/lca.nf @@ -1,34 +1,43 @@ #!/usr/bin/env nextflow nextflow.enable.dsl=2 -params.prepare_lca_options = [:] -params.reduce_lca_options = [:] - -include { PREPARE_LCA as PREP_DBS } from './../../modules/local/prepare_lca.nf' addParams( options: params.prepare_lca_options ) -include { REDUCE_LCA as REDUCE } from './../../modules/local/reduce_lca.nf' addParams( options: params.reduce_lca_options ) +include { PREPARE_LCA as PREP_DBS } from './../../modules/local/prepare_lca.nf' +include { REDUCE_LCA as REDUCE } from './../../modules/local/reduce_lca.nf' workflow LCA { take: blastp_results - blastp_dbdir + taxdump_files + prot_accession2taxid + dbtype main: + ch_versions = Channel.empty() + PREP_DBS( - blastp_dbdir + taxdump_files, + dbtype ) + ch_versions = ch_versions.mix(PREP_DBS.out.versions) + REDUCE( blastp_results, - blastp_dbdir, - PREP_DBS.out.cache + taxdump_files, + PREP_DBS.out.cache, + prot_accession2taxid.toList(), + dbtype ) + ch_versions = ch_versions.mix(REDUCE.out.versions) + emit: lca = REDUCE.out.lca error_taxid = REDUCE.out.error_taxids sseqid_to_taxids = REDUCE.out.sseqid_to_taxids cache = PREP_DBS.out.cache + versions = ch_versions } /* diff --git a/subworkflows/local/mock_data.nf b/subworkflows/local/mock_data.nf index eec9efd91..27cb0e4f0 100644 --- a/subworkflows/local/mock_data.nf +++ b/subworkflows/local/mock_data.nf @@ -1,13 +1,10 @@ -include { initOptions; saveFiles; getSoftwareName } from './functions' - -params.get_genomes_for_mock = [:] -include { GET_GENOMES_FOR_MOCK } from './../../modules/local/get_genomes_for_mock.nf' addParams( options: params.get_genomes_for_mock ) +include { GET_GENOMES_FOR_MOCK } from './../../modules/local/get_genomes_for_mock.nf' process SAMTOOLS_WGSIM { // This process is used to create simulated reads from an input FASTA file label 'process_low' - conda (params.enable_conda ? 
"bioconda::samtools=1.13" : null) + conda "bioconda::samtools=1.13" if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { container "https://depot.galaxyproject.org/singularity/samtools:1.12--hd5e65b6_0" } else { @@ -15,58 +12,65 @@ process SAMTOOLS_WGSIM { } input: - path fasta + tuple val(meta), path(metagenome) output: - path("*.fastq"), emit: fastq - path "*.version.txt" , emit: version + tuple val(meta), path("reads_1.fastq"), path("reads_2.fastq"), emit: reads + path "versions.yml" , emit: versions """ # https://sarahpenir.github.io/bioinformatics/Simulating-Sequence-Reads-with-wgsim/ - wgsim -1 300 -2 300 -r 0 -R 0 -X 0 -e 0 ${fasta} reads_1.fastq reads_2.fastq + wgsim -1 300 -2 300 -r 0 -R 0 -X 0 -e 0 ${metagenome} reads_1.fastq reads_2.fastq + - echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//' > samtools.version.txt + cat <<-END_VERSIONS > versions.yml + "${task.process}": + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + END_VERSIONS """ } workflow CREATE_MOCK { main: + ch_versions = Channel.empty() + // Download and format fasta files from specfied whole genome assemblies (genomes set from "get_genomes_for_mock" parameter in ~Autometa/conf/modules.config) GET_GENOMES_FOR_MOCK() - - // Create fake reads from input genome sequences - SAMTOOLS_WGSIM(GET_GENOMES_FOR_MOCK.out.metagenome) - + ch_versions = ch_versions.mix(GET_GENOMES_FOR_MOCK.out.versions) // Format everything with a meta map for use in the main Autometa pipeline - GET_GENOMES_FOR_MOCK.out.fake_spades_coverage + // see "create_" functions in ~subworkflows/local/input_check.nf + GET_GENOMES_FOR_MOCK.out.assembly_to_locus .map { row -> def meta = [:] meta.id = "mock_data" - meta.cov_from_assembly = "spades" return [ meta, row ] } - .set { ch_fasta } - GET_GENOMES_FOR_MOCK.out.assembly_to_locus + .set { assembly_to_locus } + GET_GENOMES_FOR_MOCK.out.assembly_report .map { row -> def meta = [:] meta.id = "mock_data" - meta.cov_from_assembly = "spades" return [ meta, row ] } - .set { assembly_to_locus } - GET_GENOMES_FOR_MOCK.out.assembly_report + .set { assembly_report } + + GET_GENOMES_FOR_MOCK.out.metagenome .map { row -> def meta = [:] meta.id = "mock_data" - meta.cov_from_assembly = "spades" return [ meta, row ] } - .set { assembly_report } + .set { metagenome } + + // Create fake reads from input genome sequences + SAMTOOLS_WGSIM(metagenome) + ch_versions = ch_versions.mix(SAMTOOLS_WGSIM.out.versions) emit: - fasta = ch_fasta - reads = SAMTOOLS_WGSIM.out.fastq + reads = SAMTOOLS_WGSIM.out.reads + fasta = metagenome assembly_to_locus = assembly_to_locus assembly_report = assembly_report + versions = ch_versions } diff --git a/subworkflows/local/prepare_gtdb.nf b/subworkflows/local/prepare_gtdb.nf new file mode 100644 index 000000000..3501a27a8 --- /dev/null +++ b/subworkflows/local/prepare_gtdb.nf @@ -0,0 +1,60 @@ + +include { DIAMOND_MAKEDB as GTDB_MAKEDB } from './../../modules/local/diamond_makedb.nf' + + +process DOWNLOAD_GTDB { + tag "Downloading GTDB database version ${params.gtdb_version}" + label 'process_low' + storeDir "${params.gtdb_dir}" + + conda "bioconda::autometa" + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0" + } else { + container "jasonkwan/autometa:${params.autometa_image_tag}" + } + + output: + path 'autometa_formatted_gtdb-version-*.faa.gz' , emit: 
gtdb_formated_faa + path 'gtdb_taxdump-version-*/*' , emit: gtdb_taxdump_directory + path "versions.yml" , emit: versions + + script: + """ + autometa-download-gtdb --version $params.gtdb_version --outdir '.' + + rm gtdb-taxdump-version-*.tar.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + autometa: \$(autometa --version | sed -e 's/autometa: //g') + gtdb: $params.gtdb_version + END_VERSIONS + """ +} + +workflow PREPARE_GTDB_DB { + + main: + ch_versions = Channel.empty() + + // if use_gtdb and large_downloads_permission is set to true, download the GTDB database + if (params.use_gtdb && params.large_downloads_permission) { + + DOWNLOAD_GTDB() + ch_versions = ch_versions.mix(DOWNLOAD_GTDB.out.versions) + + // get the single gtdb_formated_faa file and create the string e.g. autometa_formatted_gtdb-version-220.db from autometa_formatted_gtdb-version-220.faa.gz + dbname = DOWNLOAD_GTDB.out.gtdb_formated_faa.getName().replaceFirst(/\.gz$/, '').replaceFirst(/\.faa$/, '.dmnd') + GTDB_MAKEDB(DOWNLOAD_GTDB.out.gtdb_formated_faa, dbname) + ch_versions = ch_versions.mix(GTDB_MAKEDB.out.versions) + + } else { + println '\033[0;34m `--large_downloads_permission` is set to false. Skipping GTDB database download. \033[0m' + } + + emit: + diamond_db = GTDB_MAKEDB.out.diamond_db + gtdb_taxdump_directory = DOWNLOAD_GTDB.out.gtdb_taxdump_directory + versions = ch_versions +} diff --git a/subworkflows/local/prepare_ncbi_taxinfo.nf b/subworkflows/local/prepare_ncbi_taxinfo.nf index 5b6737393..8c406ca6a 100644 --- a/subworkflows/local/prepare_ncbi_taxinfo.nf +++ b/subworkflows/local/prepare_ncbi_taxinfo.nf @@ -1,11 +1,5 @@ // this file probably needs to be reevaluated, but from a python-first // perspective since the python code assumes file/directory structure -include { initOptions; saveFiles; getSoftwareName } from './functions' - -params.options = [:] -params.taxdump_tar_gz_dir = [:] -params.prot_accession2taxid_gz_dir = [:] -options = initOptions(params.options) process TEST_DOWNLOAD { // For development work so you don't download the entire prot.accession2taxid.gz database @@ -13,7 +7,7 @@ process TEST_DOWNLOAD { label 'process_low' storeDir "${params.prot_accession2taxid_gz_dir}" - conda (params.enable_conda ? "conda-forge::rsync=3.2.3" : null) + conda "conda-forge::rsync=3.2.3" if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0" } else { @@ -23,6 +17,9 @@ process TEST_DOWNLOAD { output: path("prot.accession2taxid"), emit: singlefile + when: + task.ext.when == null || task.ext.when + script: """ # https://github.com/nextflow-io/nextflow/issues/1564 @@ -36,7 +33,7 @@ process DOWNLOAD_ACESSION2TAXID { label 'process_low' storeDir "${params.prot_accession2taxid_gz_dir}" - conda (params.enable_conda ? 
"conda-forge::rsync=3.2.3" : null) + conda "conda-forge::rsync=3.2.3" if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0" } else { @@ -46,7 +43,10 @@ process DOWNLOAD_ACESSION2TAXID { output: // hack nf-core options.args3 and use for output name path "prot.accession2taxid.gz" , emit: accession2taxid - path "*.version.txt" , emit: version + path "versions.yml" , emit: versions + when: + task.ext.when == null || task.ext.when + script: """ rsync -a \\ @@ -59,17 +59,18 @@ process DOWNLOAD_ACESSION2TAXID { md5sum -c *.md5 - rsync --version | head -n1 > rsync.version.txt + cat <<-END_VERSIONS > versions.yml + "${task.process}": + rsync: \$(rsync --version | head -n1 | sed 's/^rsync version //' | sed 's/\s.*//') + END_VERSIONS """ } - process DOWNLOAD_TAXDUMP { tag "Downloading taxdump.tar.gz" label 'process_low' - storeDir "${params.taxdump_tar_gz_dir}" - conda (params.enable_conda ? "conda-forge::rsync=3.2.3" : null) + conda "conda-forge::rsync=3.2.3" if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0" } else { @@ -77,8 +78,11 @@ process DOWNLOAD_TAXDUMP { } output: - path "*" , emit: taxdump_files - path "*.version.txt" , emit: version + path "*.dmp" , emit: taxdump_files + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when script: """ @@ -95,42 +99,53 @@ process DOWNLOAD_TAXDUMP { tar -xf taxdump.tar.gz rm taxdump.tar.gz - rsync --version | head -n1 > rsync.version.txt + cat <<-END_VERSIONS > versions.yml + "${task.process}": + rsync: \$(rsync --version | head -n1 | sed 's/^rsync version //' | sed 's/\s.*//') + END_VERSIONS """ } - workflow PREPARE_TAXONOMY_DATABASES { main: + ch_versions = Channel.empty() + taxdump_dir = file(params.taxdump_tar_gz_dir) taxdump_dir_files = taxdump_dir.list() expected_files = ['citations.dmp', 'delnodes.dmp', 'division.dmp', 'gencode.dmp', 'merged.dmp', 'names.dmp', 'nodes.dmp'] - if (taxdump_dir_files.containsAll(expected_files)){ - taxdump_files = taxdump_dir_files + dmp_files = file("${params.taxdump_tar_gz_dir}/*.dmp") + taxonomy_files_exist = dmp_files.name.containsAll(expected_files) + + if (taxonomy_files_exist){ + taxdump_files = dmp_files } else { DOWNLOAD_TAXDUMP() + ch_versions = ch_versions.mix(DOWNLOAD_TAXDUMP.out.versions) DOWNLOAD_TAXDUMP.out.taxdump_files .set{taxdump_files} } - accession2taxid_dir = file(params.prot_accession2taxid_gz_dir) - accession2taxid_dir_files = accession2taxid_dir_files.list() - expected_files = ['prot.accession2taxid'] + taxonomy_files_exist2 = file("${params.prot_accession2taxid_gz_dir}/prot.accession2taxid.gz") - if (accession2taxid_dir_files.containsAll(expected_files)){ - prot_accession2taxid_ch = accession2taxid_dir_files + if (taxonomy_files_exist2.exists()){ + prot_accession2taxid_ch = taxonomy_files_exist2 } else if (params.debug){ TEST_DOWNLOAD().singlefile .set{prot_accession2taxid_ch} + } else { - DOWNLOAD_ACESSION2TAXID().accession2taxid + DOWNLOAD_ACESSION2TAXID() + DOWNLOAD_ACESSION2TAXID.out.accession2taxid .set{prot_accession2taxid_ch} + ch_versions = ch_versions.mix(DOWNLOAD_ACESSION2TAXID.out.versions) + } emit: - taxdump = taxdump_files + taxdump_files = taxdump_files prot_accession2taxid = prot_accession2taxid_ch + versions = ch_versions } diff --git a/subworkflows/local/prepare_nr.nf 
b/subworkflows/local/prepare_nr.nf index 0d1de4b68..36c58de06 100644 --- a/subworkflows/local/prepare_nr.nf +++ b/subworkflows/local/prepare_nr.nf @@ -1,20 +1,14 @@ -// Import generic module functions
-include { initOptions; saveFiles; getSoftwareName } from './functions'
-params.options = [:]
-options = initOptions(params.options)
-
-params.diamond_makedb_options = [:]
-params.nr_dmnd_dir = [:]
-
-include { DIAMOND_MAKEDB } from './../../modules/local/diamond_makedb.nf' addParams( options: params.diamond_makedb_options, nr_dmnd_dir: params.nr_dmnd_dir) +include { DIAMOND_MAKEDB } from './../../modules/local/diamond_makedb.nf' process DOWNLOAD_NR { tag "Downloading nr.gz (>100GB download. May take some time.)" label 'process_low' - storeDir "${params.nr_dmnd_dir}" + label 'process_long'
+
+ println '\033[0;34m Downloading nr.gz from NCBI, this may take a long time. \033[0m'

- conda (params.enable_conda ? "conda-forge::rsync=3.2.3" : null) + conda "bioconda::autometa" if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0" } else { @@ -22,29 +16,34 @@ process DOWNLOAD_NR { } output: - path("nr.gz"), emit: singlefile + path("nr.gz") , emit: singlefile + path "versions.yml" , emit: versions
+
+ when: + task.ext.when == null || task.ext.when

 script: - """ - rsync -a \\ - --quiet \\ - 'rsync://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz' 'nr.gz' + def args = task.ext.args ?: '-x 8 -s 8'

- rsync -a \\ - --quiet \\ - 'rsync://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz.md5' 'nr.gz.md5' + """ + aria2c ${args} 'https://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz' + aria2c 'https://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz.md5'

 md5sum -c *.md5
+
+ cat <<-END_VERSIONS > versions.yml + "${task.process}": + aria2: \$(aria2c --version | head -n1 | sed 's/^aria2 version //' | sed 's/\s.*//') + END_VERSIONS """ } - process TEST_DOWNLOAD { // For development work so you don't download the entire nr.gz database - tag "Downloading first 10,000 lines of nr.gz" + tag "Generating a small nr.gz test set" label 'process_low' - conda (params.enable_conda ?
"conda-forge::rsync=3.2.3" : null) + conda "bioconda::autometa" if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0" } else { @@ -52,41 +51,85 @@ process TEST_DOWNLOAD { } output: - path("nr.gz"), emit: singlefile + path "nr.gz", emit: singlefile + + when: + task.ext.when == null || task.ext.when script: """ - # https://github.com/nextflow-io/nextflow/issues/1564 - trap 'echo OK; exit 0;' EXIT - curl -s ftp://ftp.ncbi.nlm.nih.gov/blast/db/FASTA/nr.gz | zcat | head -n 10000 | gzip > nr.gz + +cat <<-END_VERSIONS > nr +>KJX92028.1 hypothetical protein TI39_contig5958g00003 [Zymoseptoria brevis] +MAWTRQLVPLMLLFCGAHGLQRSSTATDQLSNSALQALGSHADLAAFVNDVEAVPEIANVILAHRGITIMAPVDSAWLRV +DAIKRRNPAFLAWHIMNANVLTSDVPLVQYEQHPGITIPTFLSGSKNWTYSGEPASLISGGQSLTAITLKTEDNVIWVSG +ASNVSYIKQANISYDRGIIHKIDPALQFPTSAYETAFAVGLYSYCWAVFTAGLDQEIRRIPNSTFLLPINEAFHAALPFL +LGASREEFKRIVYRHVIPGRVLWSHEFYNASHETFEGSIVQIRGGNGRRWFVDDAMILDGSDKPLYNGVGHVVNKVLLPT +>EFG1759503.1 decarboxylating NADP(+)-dependent phosphogluconate dehydrogenase [Escherichia coli]EGJ4377881.1 decarboxylating NADP(+)-dependent phosphogluconate dehydrogenase [Escherichia coli] +LKPYLDKGDIIIDGGNTFFQDTIRRNRELSAEGFNFIGTGVSGGEEGALKGPSIMPGGQKEAYELVAPILTKIAAVAEDG +EPCVTYIGADGAGHYVKMVHNGIEYGDMQLIAEAYSLLKGGLNLTNEELAQTFTEWNNGELSSYLIDITKDIFTKKDEDG +NYLVDVILDEAANKGTGKWTSQSALDLGEPLSLITESVFARYISSLKEQRVAASKVLSGPQAQPAGDKGEFIEKVRRALY +LGKIVSYAQGFSQLRAASEEYNWDLNYGEIAKIFRAGCIIRAQFLQKITDAYIENPQIANLLLAPYFKQIADNYQQALRE +VVAYAVQNGIPVPTFAAAVAYYDSYRAAVLPANLIQAQRDYFGAHTYKRIDKEGVFHTEWL +>WP_198835266.1 pilus assembly protein [Paracoccus sp. IB05]MBJ2149627.1 pilus assembly protein [Paracoccus sp. IB05] +MTWRPLQRFLTRSDAAVTAEFVIVFPLVLALIFLIVFISMYISAASDLQQVVHELARYSYRYAGRPEANQLCATLERDAV +PILVNASLLLHPENFTLISCSPPQGPDRIIVITASYDFAGSFVQSVGRTLGLSIGTISRQSLFIP +>MBD3193859.1 hypothetical protein [Candidatus Lokiarchaeota archaeon]MBD3198741.1 hypothetical protein [Candidatus Lokiarchaeota archaeon] +MKKGFIVLILIALVSAGGLILFFYYSNDSGNGNFNTNSEKMIINHNHAHLEDFTSIPSEWIIAAKANLSIVYWHTSHGSQ +ITTGMSLLDAFMGDNDVYEFNNAGTGGALHYHEPSIDYSRRDLTGYTDQFDDETRTFLSSNPEYNVVIWSWCGLDKNNAS +INAYLTNMNQLESEYPNVHFVYMTAHLEGTGEDGDLHIYNQMIRRYCNKNNKTLYDFGDIESYNPENEYFLDRDANDGCY +YDSDGNGSLDANWATEWQSTHDGTHTYPNGGEWYDCSPAHSEAVNGNLKAYAAWYLFARLAGWNGT +>UMM52736.1 protein ORF58 [Lake sturgeon herpesvirus] +MGSMVKKRSRSLIPTSSITRWKTQSLKRPKATCASLRLTPRSTLSPQCHAGYGQSSPGANGLNRPVIDTWTRPSTAFGPS +TSLGWTPQTHIFLNGNFVSHTHGCSPAFFTATQHVNIVYNKKQQTSVFAPHLLPHKQIQSGTVLTDNNKFVTDKKKTFSV +QGVKNTRIEFTSLKNRSSNYTTNCRPLYQPAFQQFFELTGLCHGETSVTMSAMVVNNVNYTTCLYGLTNPFSFNFKICKD +HKKFHNTLFFPSVNLYKQAKGRQHQIFESRYINSQKIYPGDVNQFGFYLQTVVAQTEYDPCLNWYFCRHFEATKSFLNTP +NKTLILWFNERFYLAHPQVDIADPASYWPAYVTFMDLCVTPHLNHFIGFFSSGFGQYHNKNPEFIHLIPFLIFGAARGHN +QGLDLIASYAHRLSRLQRHESLLELRLILQIAVELLKNPQITLCDDPVRGMELSYPQSDDPDNDREKRAKKRRLVVVTKP +LCPPATVVRPLAGHQQSLVKKIQVYCQTCRRG +END_VERSIONS + +gzip nr + """ } workflow PREPARE_NR_DB { main: + ch_versions = Channel.empty() + + // TODO: this if/else can be simplified if (file("${params.nr_dmnd_dir}/nr.dmnd").exists()){ // skip huge download and db creation if nr.dmnd already exists out_ch = file("${params.nr_dmnd_dir}/nr.dmnd") } else if (file("${params.nr_dmnd_dir}/nr.gz").exists()){ // skip huge download if nr.gz already exists DIAMOND_MAKEDB(file("${params.nr_dmnd_dir}/nr.gz"), "nr") - DIAMOND_MAKEDB.out.diamond_db - .set{out_ch} + ch_versions = ch_versions.mix(DIAMOND_MAKEDB.out.versions) + out_ch = 
DIAMOND_MAKEDB.out.diamond_db + } else if (params.debug){ TEST_DOWNLOAD().singlefile .set{nr_db_ch} + DIAMOND_MAKEDB(nr_db_ch, "nr") - DIAMOND_MAKEDB.out.diamond_db - .set{out_ch} - } else { + ch_versions = ch_versions.mix(DIAMOND_MAKEDB.out.versions) + out_ch = DIAMOND_MAKEDB.out.diamond_db + + } else if (params.large_downloads_permission) { DOWNLOAD_NR().singlefile .set{nr_db_ch} + ch_versions = ch_versions.mix(DOWNLOAD_NR.out.versions) DIAMOND_MAKEDB(nr_db_ch, "nr") - DIAMOND_MAKEDB.out.diamond_db - .set{out_ch} + ch_versions = ch_versions.mix(DIAMOND_MAKEDB.out.versions) + out_ch = DIAMOND_MAKEDB.out.diamond_db + + } else { + println '\033[0;34m Neither nr.dmnd or nr.gz were found and `--large_downloads_permission` is set to false. \033[0m' } emit: diamond_db = out_ch + versions = ch_versions } diff --git a/subworkflows/local/process_metagenome.nf b/subworkflows/local/process_metagenome.nf new file mode 100644 index 000000000..4bb7a7da4 --- /dev/null +++ b/subworkflows/local/process_metagenome.nf @@ -0,0 +1,74 @@ +include { CREATE_MOCK } from './mock_data' +include { INPUT_CHECK } from './input_check' +include { SEQKIT_FILTER } from '../../modules/local/seqkit_filter' + +workflow PROCESS_METAGENOME { + + main: + + ch_versions = Channel.empty() + + // Samplesheet channel + Channel + .fromPath(params.input) + .set{samplesheet_ch} + + assembly_to_locus = Channel.empty() + assembly_report = Channel.empty() + + // Set the metagenome and coverage channels + if (params.mock_test){ + + CREATE_MOCK() + ch_versions = ch_versions.mix(CREATE_MOCK.out.versions) + + CREATE_MOCK.out.fasta + .set{metagenome_ch} + + Channel + .empty() + .set{user_provided_coverage_table} + + CREATE_MOCK.out.reads + .set{reads_ch} + + assembly_to_locus = CREATE_MOCK.out.assembly_to_locus + assembly_report = CREATE_MOCK.out.assembly_report + + } else { + + INPUT_CHECK(samplesheet_ch) + ch_versions = ch_versions.mix(INPUT_CHECK.out.versions) + + INPUT_CHECK.out.metagenome + .set{metagenome_ch} + + INPUT_CHECK.out.coverage + .set{user_provided_coverage_table} + + INPUT_CHECK.out.reads + .set{reads_ch} + + + } + + SEQKIT_FILTER( + metagenome_ch + ) + ch_versions = ch_versions.mix(SEQKIT_FILTER.out.versions) + + SEQKIT_FILTER.out.fasta + .join(reads_ch) + .set{combined_contigs_reads} + + emit: + raw_metagenome_fasta = metagenome_ch + filtered_metagenome_fasta = SEQKIT_FILTER.out.fasta + user_provided_coverage_table = user_provided_coverage_table + reads = reads_ch + filtered_metagenome_fasta_and_reads = combined_contigs_reads + filtered_metagenome_gc_content = SEQKIT_FILTER.out.gc_content + assembly_to_locus = assembly_to_locus + assembly_report = assembly_report + versions = ch_versions +} diff --git a/subworkflows/local/taxon_assignment.nf b/subworkflows/local/taxon_assignment.nf deleted file mode 100644 index f02714777..000000000 --- a/subworkflows/local/taxon_assignment.nf +++ /dev/null @@ -1,92 +0,0 @@ -params.prepare_lca_options = [:] -params.reduce_lca_options = [:] -params.majority_vote_options = [:] -params.split_kingdoms_options = [:] -params.nr_dmnd_dir = [:] -params.taxdump_tar_gz_dir = [:] -params.prot_accession2taxid_gz_dir = [:] -params.diamond_blastp_options = [:] - -params.debug = [:] -params.diamond_makedb_options = [:] -params.large_downloads_permission = [:] - - -include { PREPARE_NR_DB } from './prepare_nr.nf' addParams( debug: params.debug, diamond_makedb_options: params.diamond_makedb_options, nr_dmnd_dir: params.nr_dmnd_dir ) -include { PREPARE_TAXONOMY_DATABASES } from './prepare_ncbi_taxinfo.nf' 
addParams( debug: params.debug, taxdump_tar_gz_dir: params.taxdump_tar_gz_dir, prot_accession2taxid_gz_dir: params.prot_accession2taxid_gz_dir ) -include { LCA } from './lca.nf' addParams( prepare_lca_options: params.prepare_lca_options, reduce_lca_options: params.reduce_lca_options ) -include { MAJORITY_VOTE } from './../../modules/local/majority_vote.nf' addParams( options: params.majority_vote_options ) -include { SPLIT_KINGDOMS } from './../../modules/local/split_kingdoms.nf' addParams( options: params.split_kingdoms_options ) -include { DIAMOND_BLASTP } from './../../modules/local/diamond_blastp.nf' addParams( options: params.diamond_blastp_options ) - - -// Autometa taxon assignment workflow -workflow TAXON_ASSIGNMENT { - take: - metagenome - merged_prodigal - - main: - // check if user has given permission for large downloads - if (params.large_downloads_permission) { - // Download and prep necessary databases - PREPARE_NR_DB() - PREPARE_NR_DB.out.diamond_db - .set{diamond_db} - PREPARE_TAXONOMY_DATABASES() - PREPARE_TAXONOMY_DATABASES.out.taxdump - .set{ncbi_taxdump} - PREPARE_TAXONOMY_DATABASES.out.prot_accession2taxid - .set{prot_accession2taxid} - } else { - // check for nr.dmnd, if not found, check for nr.gz - // if nr.gz exists, create nr.dmnd - // if nr.gz also doesn't exist, stop the pipeline - if (!file("${params.nr_dmnd_dir}/nr.dmnd").exists()) { - if (file("${params.nr_dmnd_dir}/nr.gz").exists()) { - PREPARE_NR_DB() - PREPARE_NR_DB.out.diamond_db - .set{diamond_db} - } else { - throw new Exception("Neither nr.dmnd or nr.gz was found") - } - } else { - diamond_db = file("${params.nr_dmnd_dir}/nr.dmnd", checkIfExists: true) - } - } - - DIAMOND_BLASTP ( - merged_prodigal, - diamond_db - ) - - ncbi_tax_dir = file(params.taxdump_tar_gz_dir) - - LCA ( - DIAMOND_BLASTP.out.diamond_results, - ncbi_tax_dir - ) // output '${blast.simpleName}.lca.tsv' - - MAJORITY_VOTE ( - LCA.out.lca, - ncbi_tax_dir - ) //output ${lca.simpleName}.votes.tsv - - metagenome - .join( - MAJORITY_VOTE.out.votes - ) - .set{split_kingdoms_input} - - SPLIT_KINGDOMS ( - split_kingdoms_input, - ncbi_tax_dir - ) - - emit: - taxonomy = SPLIT_KINGDOMS.out.taxonomy - bacteria = SPLIT_KINGDOMS.out.bacteria - archaea = SPLIT_KINGDOMS.out.archaea - orf_votes = LCA.out.lca - contig_votes = MAJORITY_VOTE.out.votes -} diff --git a/subworkflows/local/taxon_assignment_gtdb.nf b/subworkflows/local/taxon_assignment_gtdb.nf new file mode 100644 index 000000000..3da416f5b --- /dev/null +++ b/subworkflows/local/taxon_assignment_gtdb.nf @@ -0,0 +1,109 @@ + +include { PREPARE_GTDB_DB } from './prepare_gtdb.nf' +include { TAXON_SPLIT } from './taxon_split.nf' + +process EXTRACT_ORFS { + tag "Extracting ORFs from taxon-assigned metagenome contigs" + label 'process_low' + + conda "bioconda::autometa" + if (workflow.containerEngine == 'singularity' && !params.singularity_pull_docker_container) { + container "https://depot.galaxyproject.org/singularity/autometa:2.2.0--pyh7cba7a3_0" + } else { + container "jasonkwan/autometa:${params.autometa_image_tag}" + } + + input: + tuple val(meta), path(contigs), path (orfs) + + output: + tuple val(meta), path("${meta.id}.gtdb_input.fna"), path("${meta.id}_gtdb_input_orfs.faa.gz"), emit: split_orfs + path "versions.yml", emit: versions + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + grep -h ">" $contigs | \\ + sed 's/^>//' | \\ + cut -f1 -d" " | \\ + sed 's/\\\$/_/' | \\ + grep -f - $orfs |\\ + cut -f1 -d" " |\\ + sed 's/^>//' > orf_ids + + # Retrieve ORF seqs from ORF IDs + 
seqkit grep \ + --pattern-file orf_ids \ + --out-file ${meta.id}_gtdb_input_orfs.faa.gz \ + $orfs + + cat $contigs > ${meta.id}.gtdb_input.fna + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + autometa: \$(autometa --version | sed -e 's/autometa: //g') + gtdb: $params.gtdb_version + END_VERSIONS + """ +} + + +// Autometa taxon assignment workflow +workflow GTDB_TAXON_ASSIGNMENT { + take: + split_metagenome_contigs + orfs + + main: + ch_versions = Channel.empty() + dbtype_ch = channel.value( 'gtdb') + + PREPARE_GTDB_DB() + ch_versions = ch_versions.mix(PREPARE_GTDB_DB.out.versions) + + // combine contigs and orfs into one channel + split_metagenome_contigs + .filter { meta, path -> + meta.taxon in ['bacteria', 'archaea'] + } + .map { meta, path -> + def cleanMeta = meta.findAll { k,v -> k != 'taxon' } + [cleanMeta, path] + } + .groupTuple(by: 0) + .combine( + orfs, by: 0 + ) + .set { contigs_and_orfs_ch } + + + EXTRACT_ORFS(contigs_and_orfs_ch) + + prot_accession2taxid_ch =Channel.fromPath(file("$baseDir/assets/dummy_file.txt", checkIfExists: true )) + + + TAXON_SPLIT( + EXTRACT_ORFS.out.split_orfs, + PREPARE_GTDB_DB.out.diamond_db, + PREPARE_GTDB_DB.out.gtdb_taxdump_directory, + prot_accession2taxid_ch, + dbtype_ch + ) + ch_versions = ch_versions.mix(TAXON_SPLIT.out.versions) + + + TAXON_SPLIT.out.taxonomically_split_fna.view { meta -> + println "taxonomically_split_fnabro: ${meta}" + } + + emit: + taxonomy = TAXON_SPLIT.out.taxonomy + taxonomically_split_fna = TAXON_SPLIT.out.taxonomically_split_fna + lca = TAXON_SPLIT.out.lca + votes = TAXON_SPLIT.out.votes + taxdump_files = PREPARE_GTDB_DB.out.gtdb_taxdump_directory + dbtype = dbtype_ch + versions = ch_versions + +} + diff --git a/subworkflows/local/taxon_assignment_ncbi.nf b/subworkflows/local/taxon_assignment_ncbi.nf new file mode 100644 index 000000000..ee8a511b2 --- /dev/null +++ b/subworkflows/local/taxon_assignment_ncbi.nf @@ -0,0 +1,44 @@ + +include { PREPARE_NR_DB } from './prepare_nr.nf' +include { PREPARE_TAXONOMY_DATABASES } from './prepare_ncbi_taxinfo.nf' +include { TAXON_SPLIT } from './taxon_split.nf' + +// Autometa taxon assignment workflow +workflow NCBI_TAXON_ASSIGNMENT { + take: + filtered_metagenome_fasta + orfs + + main: + ch_versions = Channel.empty() + dbtype_ch = channel.value( 'ncbi') + + PREPARE_TAXONOMY_DATABASES() + ch_versions = ch_versions.mix(PREPARE_TAXONOMY_DATABASES.out.versions) + + PREPARE_NR_DB() + ch_versions = ch_versions.mix(PREPARE_NR_DB.out.versions) + + contigs_and_orfs = filtered_metagenome_fasta.join(orfs) + + TAXON_SPLIT( + contigs_and_orfs, + PREPARE_NR_DB.out.diamond_db, + PREPARE_TAXONOMY_DATABASES.out.taxdump_files, + PREPARE_TAXONOMY_DATABASES.out.prot_accession2taxid, + dbtype_ch + ) + + ch_versions = ch_versions.mix(TAXON_SPLIT.out.versions) + + emit: + taxonomy = TAXON_SPLIT.out.taxonomy + taxonomically_split_fna = TAXON_SPLIT.out.taxonomically_split_fna + lca = TAXON_SPLIT.out.lca + votes = TAXON_SPLIT.out.votes + taxdump_files = PREPARE_TAXONOMY_DATABASES.out.taxdump_files + dbtype = dbtype_ch + versions = ch_versions + +} + diff --git a/subworkflows/local/taxon_split.nf b/subworkflows/local/taxon_split.nf new file mode 100644 index 000000000..312897d6c --- /dev/null +++ b/subworkflows/local/taxon_split.nf @@ -0,0 +1,90 @@ + +include { PREPARE_NR_DB } from './prepare_nr.nf' +include { PREPARE_TAXONOMY_DATABASES } from './prepare_ncbi_taxinfo.nf' +include { LCA } from './lca.nf' +include { MAJORITY_VOTE } from './../../modules/local/majority_vote.nf' +include { 
SPLIT_KINGDOMS } from './../../modules/local/split_kingdoms.nf' +include { DIAMOND_BLASTP } from './../../modules/local/diamond_blastp.nf' + + + + +// Autometa taxon assignment workflow +workflow TAXON_SPLIT { + take: + contigs_and_orfs + diamond_db_ch + taxdump_ch + prot_accession2taxid_ch + dbtype_ch + + main: + ch_versions = Channel.empty() + + contigs_and_orfs.multiMap { meta, fna_file, orfs_file -> + fna: [meta, fna_file] + orfs: [meta, orfs_file] + }.set { result } + + DIAMOND_BLASTP ( + result.orfs, + diamond_db_ch + ) + ch_versions = ch_versions.mix(DIAMOND_BLASTP.out.versions) + + LCA ( + DIAMOND_BLASTP.out.diamond_results, + taxdump_ch, + prot_accession2taxid_ch, + dbtype_ch + ) + ch_versions = ch_versions.mix(LCA.out.versions) + + MAJORITY_VOTE ( + LCA.out.lca, + taxdump_ch, + dbtype_ch + ) + ch_versions = ch_versions.mix(MAJORITY_VOTE.out.versions) + + result.fna + .join( + MAJORITY_VOTE.out.votes + ) + .set{split_kingdoms_input} + + SPLIT_KINGDOMS ( + split_kingdoms_input, + taxdump_ch, + dbtype_ch + ) + + // Step 1: Generate combinations of meta and fna_file and flatten them correctly + // handle if multiple fna files are present + SPLIT_KINGDOMS.out.fna.map { meta, fna_file -> + fna_file = fna_file instanceof List ? fna_file : [fna_file] + [[meta], fna_file].combinations() + }.flatten().collate(2) // Creates pairs of [meta, fna_file] + .set { tempch1 } + + // Step 2: Map each pair to set the taxon correctly for each meta-fna_file pair + tempch1.map{ meta, fna_file -> + // Set the taxon by extracting it from the fna_file name + def new_meta = meta.clone() + new_meta.taxon = fna_file.getName().tokenize('.')[-2] + return [new_meta, fna_file] // Return a copy of meta to ensure independent taxon setting + } .set { taxonomically_split_fna_ch } + + + ch_versions = ch_versions.mix(SPLIT_KINGDOMS.out.versions) + + emit: + taxonomy = SPLIT_KINGDOMS.out.taxonomy + taxonomically_split_fna = taxonomically_split_fna_ch + lca = LCA.out.lca + votes = MAJORITY_VOTE.out.votes + taxdump_files = taxdump_ch + dbtype = dbtype_ch + versions = ch_versions + +} diff --git a/subworkflows/local/taxonomy_workflow.nf b/subworkflows/local/taxonomy_workflow.nf new file mode 100644 index 000000000..061bf749d --- /dev/null +++ b/subworkflows/local/taxonomy_workflow.nf @@ -0,0 +1,53 @@ + +include { NCBI_TAXON_ASSIGNMENT as NCBI } from './taxon_assignment_ncbi.nf' +include { GTDB_TAXON_ASSIGNMENT as GTDB_REFINEMENT } from './taxon_assignment_gtdb.nf' + +// Autometa taxon assignment workflow +workflow TAXONOMY_WORKFLOW { + take: + filtered_metagenome_fasta + merged_prodigal + + main: + ch_versions = Channel.empty() + + if (params.taxonomy_aware) { + NCBI( + filtered_metagenome_fasta, + merged_prodigal + ) + ch_versions = ch_versions.mix(NCBI.out.versions) + + if (params.use_gtdb) { + GTDB_REFINEMENT( + NCBI.out.taxonomically_split_fna, + merged_prodigal + ) + ch_versions = ch_versions.mix(GTDB_REFINEMENT.out.versions) + taxonomy = GTDB_REFINEMENT.out.taxonomy + taxonomically_split_fna_ch = GTDB_REFINEMENT.out.taxonomically_split_fna + orf_votes = GTDB_REFINEMENT.out.lca + contig_votes = GTDB_REFINEMENT.out.votes + taxdump_files = GTDB_REFINEMENT.out.taxdump_files + dbtype = GTDB_REFINEMENT.out.dbtype + + } else { + taxonomy = NCBI.out.taxonomy + taxonomically_split_fna_ch = NCBI.out.taxonomically_split_fna + orf_votes = NCBI.out.lca + contig_votes = NCBI.out.votes + taxdump_files = NCBI.out.taxdump_files + dbtype = NCBI.out.dbtype + } + } + + emit: + taxonomy = taxonomy + taxonomically_split_fna = 
taxonomically_split_fna_ch + orf_votes = orf_votes + contig_votes = contig_votes + taxdump_files = taxdump_files + dbtype = dbtype + versions = ch_versions +} + diff --git a/subworkflows/local/unclustered_recruitment.nf b/subworkflows/local/unclustered_recruitment.nf deleted file mode 100644 index afd656f21..000000000 --- a/subworkflows/local/unclustered_recruitment.nf +++ /dev/null @@ -1,70 +0,0 @@ -params.binning_options = [:] -params.unclustered_recruitment_options = [:] -params.binning_summary_options = [:] -params.taxdump_tar_gz_dir = [:] - -include { RECRUIT } from './../../modules/local/unclustered_recruitment.nf' addParams( options: params.unclustered_recruitment_options ) -include { BINNING_SUMMARY as UNCLUSTERED_BINNING_SUMMARY } from './../../modules/local/binning_summary.nf' addParams( options: params.binning_summary_options, taxdump_tar_gz_dir: params.taxdump_tar_gz_dir ) - - -workflow UNCLUSTERED_RECRUITMENT { - - take: - metagenome - kmers_normalized - coverage - markers - taxon_assignments - binning - - main: - - kmers_normalized - .join( - coverage - ).join( - binning //BINNING.out.binning - ).join( - markers - ) - .set{coverage_binningout_markers} - - if (params.taxonomy_aware) { - coverage_binningout_markers - .join( - taxon_assignments - ) - .set{unclustered_recruitment_ch} - } else { - coverage_binningout_markers - .combine( - taxon_assignments - ) - .set{unclustered_recruitment_ch} - } - - RECRUIT ( - unclustered_recruitment_ch - ) - - RECRUIT.out.main - .join( - markers - ).join( - metagenome - ) - .set{unclustered_recruitment_summary_ch} - - // UNCLUSTERED_BINNING_SUMMARY ( - // unclustered_recruitment_summary_ch, - // "recruited_cluster" - // ) - - emit: - recruitment = RECRUIT.out.binning - recruitment_main = RECRUIT.out.main - all_binning_results = binning | mix(RECRUIT.out) | collect - // unclustered_recruitment_summary_stats = UNCLUSTERED_BINNING_SUMMARY.out.stats - // unclustered_recruitment_summary_taxa = UNCLUSTERED_BINNING_SUMMARY.out.taxonomies - // unclustered_recruitment_metabins = UNCLUSTERED_BINNING_SUMMARY.out.metabins -} diff --git a/workflows/autometa.nf b/workflows/autometa.nf index ae84aa7c0..1a0280281 100644 --- a/workflows/autometa.nf +++ b/workflows/autometa.nf @@ -4,49 +4,14 @@ * ------------------------------------------------- */ -def modules = params.modules.clone() - -if (params.single_db_dir) { - internal_nr_dmnd_dir = params.single_db_dir - internal_prot_accession2taxid_gz_dir = params.single_db_dir - internal_taxdump_tar_gz_dir = params.single_db_dir -} -// TODO: when implementing the ability to set individual DB dirs -// just override e.g. 'internal_nr_dmnd_location' here so users can set -// 'single_db_dir' but also set individual other db paths if they have them -// e.g. if they have nr.dmnd but not the other files. - -if (params.large_downloads_permission) { - // TODO: check if files already exist, if they don't fail the pipeline early at this stage -} else { - // TODO: check if files exist, if they don't fail the pipeline early at this stage -} - -// if these are still null then it means they weren't set, so make them null. -// this only works because the markov models are inside the docker image. 
-// that needs to be changed in future versions - -if (!params.taxonomy_aware) { - single_db_dir = null - internal_nr_dmnd_dir = null - internal_prot_accession2taxid_gz_dir = null - internal_taxdump_tar_gz_dir = null -} - /* * ------------------------------------------------- * Import local modules * ------------------------------------------------- */ - -include { GET_SOFTWARE_VERSIONS } from '../modules/local/get_software_versions' addParams( options: [publish_files : ['csv':'']] ) -include { SEQKIT_FILTER } from '../modules/local/seqkit_filter' addParams( options: [publish_files : ['*':'']] ) -include { SPADES_KMER_COVERAGE as COV_FROM_SPADES } from '../modules/local/spades_kmer_coverage' addParams( options: modules['spades_kmer_coverage'] ) -include { MARKERS } from '../modules/local/markers' addParams( options: modules['seqkit_split_options'] ) -include { BINNING } from '../modules/local/binning' addParams( options: modules['binning_options'] ) -include { RECRUIT } from '../modules/local/unclustered_recruitment' addParams( options: modules['unclustered_recruitment_options']) -include { BINNING_SUMMARY } from '../modules/local/binning_summary' addParams( options: modules['binning_summary_options'] ) -include { MOCK_DATA_REPORT } from '../modules/local/mock_data_reporter' addParams( options: modules['mock_data_report'] ) +include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' +include { MARKERS } from '../modules/local/markers' +include { MOCK_DATA_REPORT } from '../modules/local/mock_data_reporter' /* * ------------------------------------------------- @@ -56,7 +21,7 @@ include { MOCK_DATA_REPORT } from '../modules/local/mock_ // https://github.com/nf-core/modules/tree/master/modules // https://nf-co.re/tools/#modules // nf-core modules --help -include { PRODIGAL } from './../modules/nf-core/modules/prodigal/main' addParams( options: modules['prodigal_options'] ) +include { PRODIGAL } from './../modules/nf-core/prodigal/main.nf' /* * ------------------------------------------------- @@ -64,78 +29,28 @@ include { PRODIGAL } from './../modules/nf-core/modules/prodigal/main' addParam * ------------------------------------------------- */ -include { CREATE_MOCK } from '../subworkflows/local/mock_data' addParams( get_genomes_for_mock: modules['get_genomes_for_mock']) -include { INPUT_CHECK } from '../subworkflows/local/input_check' addParams( ) -include { CONTIG_COVERAGE as COVERAGE } from '../subworkflows/local/contig_coverage' addParams( align_reads_options: modules['align_reads_options'], samtools_viewsort_options: modules['samtools_viewsort_options'], bedtools_genomecov_options: modules['bedtools_genomecov_options']) -include { KMERS } from '../subworkflows/local/kmers' addParams( count_kmers_options: modules['count_kmers_options'], normalize_kmers_options: modules['normalize_kmers_options'], embed_kmers_options: modules['embed_kmers_options']) -include { TAXON_ASSIGNMENT } from '../subworkflows/local/taxon_assignment' addParams( options: modules['taxon_assignment'], majority_vote_options: modules['majority_vote_options'], split_kingdoms_options: modules['split_kingdoms_options'], nr_dmnd_dir: internal_nr_dmnd_dir, taxdump_tar_gz_dir: internal_taxdump_tar_gz_dir, prot_accession2taxid_gz_dir: internal_prot_accession2taxid_gz_dir, diamond_blastp_options: modules['diamond_blastp_options'], large_downloads_permission: params.large_downloads_permission ) +include { COVERAGE } from '../subworkflows/local/coverage' +include { KMERS } from 
'../subworkflows/local/kmers' +include { PROCESS_METAGENOME } from '../subworkflows/local/process_metagenome' +include { TAXONOMY_WORKFLOW } from '../subworkflows/local/taxonomy_workflow' +include { BIN } from '../subworkflows/local/binning' workflow AUTOMETA { - // Software versions channel - Channel - .empty() - .set{ch_software_versions} - // Samplesheet channel - Channel - .fromPath(params.input) - .set{samplesheet_ch} - - // Set the metagenome and coverage channels - if (params.mock_test){ - CREATE_MOCK() - CREATE_MOCK.out.fasta - .set{metagenome_ch} - Channel - .empty() - .set{coverage_tab_ch} - } else { - INPUT_CHECK(samplesheet_ch) - INPUT_CHECK.out.metagenome - .set{metagenome_ch} - INPUT_CHECK.out.coverage - .set{coverage_tab_ch} - } + ch_versions = Channel.empty() - SEQKIT_FILTER( - metagenome_ch - ) - SEQKIT_FILTER.out.fasta - .set{fasta_ch} - - /* - * ------------------------------------------------- - * Find coverage, currently only pulling from SPADES output - * ------------------------------------------------- - */ + PROCESS_METAGENOME() + ch_versions = ch_versions.mix(PROCESS_METAGENOME.out.versions) - - if (!params.mock_test) { - fasta_ch - .join(INPUT_CHECK.out.reads) - .set{coverage_input_ch} - } else { - Channel - .empty() - .set{coverage_input_ch} - } - - COVERAGE ( - coverage_input_ch + COVERAGE( + PROCESS_METAGENOME.out.filtered_metagenome_fasta, + PROCESS_METAGENOME.out.filtered_metagenome_fasta_and_reads, + PROCESS_METAGENOME.out.user_provided_coverage_table ) - COVERAGE.out.coverage - .set{contig_coverage_ch} + ch_versions = ch_versions.mix(COVERAGE.out.versions) - COV_FROM_SPADES ( - fasta_ch, - ) - COV_FROM_SPADES.out.coverage - .set{spades_kmer_coverage_ch} - // https://nextflow-io.github.io/patterns/index.html#_conditional_process_executions - contig_coverage_ch - .mix(spades_kmer_coverage_ch) - .mix(coverage_tab_ch) - .set{coverage_ch} + filtered_metagenome_fasta = PROCESS_METAGENOME.out.filtered_metagenome_fasta + coverage_ch = COVERAGE.out.coverage_ch /* * ------------------------------------------------- @@ -144,9 +59,10 @@ workflow AUTOMETA { */ PRODIGAL ( - fasta_ch, + filtered_metagenome_fasta, "gbk" ) + ch_versions = ch_versions.mix(PRODIGAL.out.versions) PRODIGAL.out.amino_acid_fasta .set{orfs_ch} @@ -158,26 +74,31 @@ workflow AUTOMETA { */ if (params.taxonomy_aware) { - TAXON_ASSIGNMENT ( - fasta_ch, + TAXONOMY_WORKFLOW ( + filtered_metagenome_fasta, orfs_ch ) - TAXON_ASSIGNMENT.out.taxonomy - .set{taxonomy_results} - if (params.kingdom.equals('bacteria')) { - TAXON_ASSIGNMENT.out.bacteria - .set{kmers_input_ch} - } else { - // params.kingdom.equals('archaea') - TAXON_ASSIGNMENT.out.archaea - .set{kmers_input_ch} - } + ch_versions = ch_versions.mix(TAXONOMY_WORKFLOW.out.versions) + + taxonomy_results = TAXONOMY_WORKFLOW.out.taxonomy + taxdump_files = TAXONOMY_WORKFLOW.out.taxdump_files + taxonomically_split_fna_ch = TAXONOMY_WORKFLOW.out.taxonomically_split_fna + } else { - fasta_ch - .set{kmers_input_ch} + filtered_metagenome_fasta + .map { meta, fna -> + def new_meta = meta.clone() + new_meta['taxon'] = 'unclassified' + return [new_meta, fna] + } + .set{taxonomically_split_fna_ch} + Channel .fromPath(file("$baseDir/assets/dummy_file.txt", checkIfExists: true )) .set{taxonomy_results} + Channel + .fromPath(file("$baseDir/assets/dummy_file.txt", checkIfExists: true )) + .set{taxdump_files} } /* @@ -186,106 +107,59 @@ workflow AUTOMETA { * ------------------------------------------------- */ - KMERS( - kmers_input_ch - ) - KMERS.out.normalized - 
.set{kmers_normalized_ch} - - KMERS.out.embedded - .set{kmers_embedded_ch} - + KMERS( taxonomically_split_fna_ch ) + ch_versions = ch_versions.mix(KMERS.out.versions) // -------------------------------------------------------------------------------- // Run hmmscan and look for marker genes in contig orfs // -------------------------------------------------------------------------------- - - MARKERS( - orfs_ch - ) - MARKERS.out.markers_tsv - .set{markers_ch} - - // Prepare inputs for binning channel - kmers_embedded_ch - .join(coverage_ch) - .join(SEQKIT_FILTER.out.gc_content) - .join(markers_ch) - .set{binning_ch} - if (params.taxonomy_aware) { - binning_ch - .join(taxonomy_results) - .set{binning_ch} - } else { - binning_ch - .combine(taxonomy_results) - .set{binning_ch} - } - - BINNING( - binning_ch - ) - - if (params.unclustered_recruitment) { - // Prepare inputs for recruitment channel - kmers_normalized_ch - .join(coverage_ch) - .join(BINNING.out.main) - .join(markers_ch) - .set{recruitment_ch} - if (params.taxonomy_aware) { - recruitment_ch - .join(taxonomy_results) - .set{recruitment_ch} - } else { - recruitment_ch - .combine(taxonomy_results) - .set{recruitment_ch} + Channel + .fromList(['bacteria', 'archaea']) + .set { kingdoms } + + // Ensure orfs_ch is defined before using + orfs_ch + .combine(kingdoms) + .map { pair -> + def (meta, orfs_file, kingdom) = pair // Correctly extract values from pair + def new_meta = meta.clone() + new_meta['taxon'] = kingdom + return [new_meta, orfs_file] } - RECRUIT( - recruitment_ch - ) - RECRUIT.out.main - .set{binning_results_ch} - Channel - .value("recruited_cluster") - .set{binning_col} - } else { - BINNING.out.main - .set{binning_results_ch} - Channel - .value("cluster") - .set{binning_col} - } + .set { orfs_taxon_ch } - // Set inputs for binning summary - binning_results_ch - .join(markers_ch) - .join(fasta_ch) - .set{binning_summary_ch} + MARKERS( orfs_taxon_ch ) - if (params.single_db_dir) { - ncbi = file(params.single_db_dir) - } else { - ncbi = file("$baseDir/assets/dummy_file.txt") - } + ch_versions = ch_versions.mix(MARKERS.out.versions) - BINNING_SUMMARY( - binning_summary_ch, - binning_col, - ncbi, - ) + markers_ch = MARKERS.out.markers_tsv - if (params.mock_test){ - binning_results_ch - .join(CREATE_MOCK.out.assembly_to_locus) - .join(CREATE_MOCK.out.assembly_report) - .set { mock_input_ch } + BIN( + taxonomically_split_fna_ch, + PROCESS_METAGENOME.out.filtered_metagenome_gc_content, + markers_ch, + coverage_ch, + taxonomy_results, + KMERS.out.embedded, + taxdump_files, + TAXONOMY_WORKFLOW.out.dbtype + ) - MOCK_DATA_REPORT( - mock_input_ch, - file("$baseDir/lib/mock_data_report.Rmd") - ) - } + // if (params.mock_test){ + // BIN.out.binning_results + // .join(PROCESS_METAGENOME.out.assembly_to_locus) + // .join(PROCESS_METAGENOME.out.assembly_report) + // .set { mock_input_ch } + + // MOCK_DATA_REPORT( + // mock_input_ch, + // file("$baseDir/lib/mock_data_report.Rmd") + // ) + // ch_versions = ch_versions.mix(MOCK_DATA_REPORT.out.versions) + // } + + CUSTOM_DUMPSOFTWAREVERSIONS ( + ch_versions.unique().collectFile(name: 'collated_versions.yml') + ) }
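Note on the kingdom fan-out added to workflows/autometa.nf above: `combine` crosses each `[meta, orfs]` tuple with the kingdom list, and `map` stamps a cloned meta map with a `taxon` key so every kingdom receives an independent copy of the metadata. Below is a minimal standalone sketch of that pattern; the script name, sample ID, and file name are invented for illustration and only the combine/map/clone logic mirrors the workflow.

// kingdom_fanout_sketch.nf -- illustrative only; channel contents are hypothetical
nextflow.enable.dsl = 2

workflow {

    // Hypothetical ORF channel shaped like PRODIGAL.out.amino_acid_fasta: [ meta, orfs ]
    Channel
        .of( [ [id: 'sample1'], file('sample1.orfs.faa') ] )
        .set { orfs_ch }

    Channel
        .fromList( ['bacteria', 'archaea'] )
        .set { kingdoms }

    orfs_ch
        .combine(kingdoms)                  // emits [ meta, orfs, 'bacteria' ] and [ meta, orfs, 'archaea' ]
        .map { meta, orfs, kingdom ->
            def new_meta = meta.clone()     // shallow copy so each kingdom carries its own meta map
            new_meta.taxon = kingdom
            [ new_meta, orfs ]
        }
        .view()                             // [ [id:sample1, taxon:bacteria], ... ] then [ [id:sample1, taxon:archaea], ... ]
}

Cloning the meta map is the important step: Groovy maps are passed by reference, so without `clone()` both kingdom tuples would share one mutable map and the last `taxon` assignment would overwrite the first.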