diff --git a/subworkflows/ebi-metagenomics/pathofact2/main.nf b/subworkflows/ebi-metagenomics/pathofact2/main.nf
new file mode 100644
index 00000000..227c2fa6
--- /dev/null
+++ b/subworkflows/ebi-metagenomics/pathofact2/main.nf
@@ -0,0 +1,126 @@
+// Subworkflow to generate toxins and virulence factors annotation from protein sequences
+// Outputs are filtered by threshold and integrated into a single GFF3 format output
+include { PATHOFACT2_DOWNLOADDATA } from '../../../modules/ebi-metagenomics/pathofact2/downloaddata/main'
+include { PATHOFACT2_TOXINS } from '../../../modules/ebi-metagenomics/pathofact2/toxins/main'
+include { PATHOFACT2_VIRULENCE } from '../../../modules/ebi-metagenomics/pathofact2/virulence/main'
+include { PATHOFACT2_INTEGRATOR } from '../../../modules/ebi-metagenomics/pathofact2/integrator/main'
+include { PATHOFACT2_EXTRACTFASTA } from '../../../modules/ebi-metagenomics/pathofact2/extractfasta/main'
+include { LOCALCDSEARCH_ANNOTATE } from '../../../modules/nf-core/localcdsearch/annotate/main'
+include { LOCALCDSEARCH_DOWNLOAD } from '../../../modules/nf-core/localcdsearch/download/main'
+include { DIAMOND_BLASTP } from '../../../modules/nf-core/diamond/blastp/main'
+include { DIAMOND_MAKEDB } from '../../../modules/nf-core/diamond/makedb/main'
+include { WGET } from '../../../modules/nf-core/wget/main'
+
+// For every sample: run the toxin and virulence predictors and a DIAMOND blastp search
+// against VFDB, extract the positive matches, annotate them with the supplied InterProScan
+// table (or with local-cd-search when none is supplied) and integrate everything into a GFF.
+// Any database input passed as a falsy value (e.g. null) is downloaded/built here instead.
+// Caller-supplied database channels should be value channels so every sample can reuse them.
+workflow PATHOFACT2 {
+    take:
+    ch_inputs // channel: tuple( val(meta), path(aminoacids), path(cds_gff), path(ips_tsv) )
+    ch_models // channel: path( pathofact2_db )
+    ch_vfdb // channel: path( vfdb )
+    ch_cdd // channel: path( cdd_db )
+    ch_zenodo_id // channel: value( pathofact2_db_zenodo_id )
+    ch_vfdb_url // channel: tuple( val(meta2), val(vfdb_url) )
+
+    main:
+    ch_versions = channel.empty()
+
+    // Extract individual components from input channel
+    ch_faa = ch_inputs.map{ meta, aminoacids, _cds_gff, _ips_tsv -> tuple(meta, aminoacids) }
+    ch_gff = ch_inputs.map{ meta, _aminoacids, cds_gff, _ips_tsv -> tuple(meta, cds_gff) }
+    ch_ips = ch_inputs.map{ meta, _aminoacids, _cds_gff, ips_tsv -> tuple(meta, ips_tsv) }
+
+    // Split inputs based on whether IPS annotation is provided.
+    // The without_ips branch keeps only the meta map, which is used later as a join key.
+    ch_ips
+        .branch { meta, ips_tsv ->
+            with_ips: ips_tsv
+                return tuple(meta, ips_tsv)
+            without_ips: !ips_tsv
+                return meta
+        }
+        .set { ch_ips_branched }
+
+    ch_with_ips = ch_ips_branched.with_ips
+    ch_without_ips = ch_ips_branched.without_ips
+
+    // Preparing databases
+    if (ch_models) {
+        pathofact_models = ch_models
+    } else {
+        PATHOFACT2_DOWNLOADDATA(ch_zenodo_id)
+        pathofact_models = PATHOFACT2_DOWNLOADDATA.out.zenodo_file
+    }
+
+    if (ch_vfdb) {
+        vfdb_diamond_db = ch_vfdb
+    } else {
+        WGET(ch_vfdb_url)
+        ch_versions = ch_versions.mix(WGET.out.versions.first())
+        DIAMOND_MAKEDB(WGET.out.outfile, [], [], [])
+        ch_versions = ch_versions.mix(DIAMOND_MAKEDB.out.versions.first())
+        // When ch_vfdb_url is a queue channel (e.g. channel.of(...)), DIAMOND_MAKEDB.out.db is a
+        // one-item queue channel and DIAMOND_BLASTP would run for the first sample only.
+        // first() turns it into a value channel so the same database is reused for every sample.
+        vfdb_diamond_db = DIAMOND_MAKEDB.out.db.first()
+    }
+
+    // Prepare CDD database (only needed by samples that come without an IPS table)
+    if (ch_cdd) {
+        cdd_database = ch_cdd
+    } else {
+        // Gate the download on ch_without_ips: first() stays empty when every sample has an
+        // IPS table, so the CDD database is not downloaded when nothing will use it.
+        LOCALCDSEARCH_DOWNLOAD(ch_without_ips.first().map { _meta -> ['cdd_ncbi'] })
+        cdd_database = LOCALCDSEARCH_DOWNLOAD.out.db
+    }
+
+    // Running prediction
+    PATHOFACT2_TOXINS( ch_faa, pathofact_models )
+
+    PATHOFACT2_VIRULENCE( ch_faa, pathofact_models )
+
+    // Searching for hits in VFDB
+    DIAMOND_BLASTP( ch_faa, vfdb_diamond_db, 6, 'qseqid sseqid pident length qlen slen evalue bitscore')
+    ch_versions = ch_versions.mix(DIAMOND_BLASTP.out.versions.first())
+
+    // Extracting positive matches
+    ch_extractfasta_input = ch_faa
+        .join(DIAMOND_BLASTP.out.txt)
+        .join(PATHOFACT2_TOXINS.out.tsv)
+        .join(PATHOFACT2_VIRULENCE.out.tsv)
+    PATHOFACT2_EXTRACTFASTA(ch_extractfasta_input)
+
+    // Running annotation using local-cd-search when ips_tsv is not provided
+    ch_fasta_for_cdd = PATHOFACT2_EXTRACTFASTA.out.fasta
+        .join(ch_without_ips)
+    LOCALCDSEARCH_ANNOTATE(ch_fasta_for_cdd, cdd_database, false)
+
+    // Combine IPS annotations with CDD annotations
+    prot_annot = ch_with_ips.mix(LOCALCDSEARCH_ANNOTATE.out.result)
+
+    // Set annotation type based on source
+    annot_type = ch_with_ips
+        .map { meta, _ips_tsv -> tuple(meta, 'ips') }
+        .mix(
+            LOCALCDSEARCH_ANNOTATE.out.result.map { meta, _annot -> tuple(meta, 'cdd') }
+        )
+
+    // Integrating results in a single gff file
+    ch_for_integrator = ch_gff
+        .join(prot_annot)
+        .join(PATHOFACT2_EXTRACTFASTA.out.tsv)
+        .join(annot_type)
+    PATHOFACT2_INTEGRATOR(ch_for_integrator)
+
+    // Handle cases where no predictions are made (integrator produces no output)
+    ch_gff_output = PATHOFACT2_INTEGRATOR.out.gff.ifEmpty([])
+
+    emit:
+    gff = ch_gff_output // channel: tuple( val(meta), path(gff) ), or [] when the integrator emitted nothing
+    versions = ch_versions // channel: [ versions.yml ]
+
+}
diff --git a/subworkflows/ebi-metagenomics/pathofact2/meta.yml b/subworkflows/ebi-metagenomics/pathofact2/meta.yml
new file mode 100644
index 00000000..62efccc2
--- /dev/null
+++ b/subworkflows/ebi-metagenomics/pathofact2/meta.yml
@@ -0,0 +1,89 @@
+# yaml-language-server: $schema=https://raw.githubusercontent.com/EBI-Metagenomics/nf-modules/main/subworkflows/yaml-schema.json
+name: "pathofact2"
+description: Pipeline for predicting virulence factors and toxins in protein sequences
+keywords:
+  - pathofact2
+  - virulence
+  - toxin
+  - vfdb
+  - cdd
+  - gff3
+components:
+  - localcdsearch/annotate:
+      git_remote: https://github.com/nf-core/modules.git
+  - localcdsearch/download:
+      git_remote: https://github.com/nf-core/modules.git
+  - diamond/blastp:
+      git_remote: https://github.com/nf-core/modules.git
+  - diamond/makedb:
+      git_remote: https://github.com/nf-core/modules.git
+  - wget:
+      git_remote: https://github.com/nf-core/modules.git
+  - pathofact2/downloaddata
+  - pathofact2/toxins
+  - pathofact2/virulence
+  - pathofact2/integrator
+  - pathofact2/extractfasta
+
+input:
+  - meta:
+      type: map
+      description: |
+        Groovy Map containing sample information
+        e.g.
`[ id:'sample1' ]`
+    ch_inputs:
+      type: file
+      description: |
+        The input channel containing the protein fasta file, the gff file and the interproscan tsv file
+        e.g. `[ [meta], aminoacids, cds_gff, ips_tsv ]`
+  - ch_models:
+      type: directory
+      description: |
+        The input channel containing the path to the Pathofact2 models database directory
+  - ch_vfdb:
+      type: directory
+      description: |
+        The input channel containing the path to the Virulence Factors Database (VFDB) directory
+  - ch_cdd:
+      type: directory
+      description: |
+        The input channel containing the path to the Conserved Domain Database (CDD) directory
+  - ch_zenodo_id:
+      type: string
+      description: |
+        The Zenodo record id of the Pathofact2 models database
+  - meta2:
+      type: map
+      description: |
+        Groovy Map containing the VFDB database id
+        e.g. `[ id:'VFDB_setB_pro' ]`
+    ch_vfdb_url:
+      type: string
+      description: |
+        The url to download VFDB using wget
+        e.g. `[[id:'VFDB_setB_pro'], 'https://www.mgc.ac.cn/VFs/Down/VFDB_setB_pro.fas.gz']`
+output:
+  - gff:
+      description: |
+        Channel containing a gff file with Pathofact2 annotation
+        Structure: [ val(meta), path("*.gff") ]
+      meta:
+        type: map
+        description: |
+          Groovy Map containing sample information
+          e.g.
`[ id:'sample1' ]`
+      "*.gff":
+        type: file
+        description: gff file with Pathofact2 annotation
+        pattern: "*.gff"
+  - versions:
+      type: file
+      description: |
+        File containing software versions
+        Structure: [ path(versions.yml) ]
+      pattern: "versions.yml"
+
+authors:
+  - "@Ales-ibt"
+maintainers:
+  - "@Ales-ibt"
diff --git a/subworkflows/ebi-metagenomics/pathofact2/tests/data/negative_test.faa.gz b/subworkflows/ebi-metagenomics/pathofact2/tests/data/negative_test.faa.gz
new file mode 100644
index 00000000..3ef04759
Binary files /dev/null and b/subworkflows/ebi-metagenomics/pathofact2/tests/data/negative_test.faa.gz differ
diff --git a/subworkflows/ebi-metagenomics/pathofact2/tests/data/test_ips.tsv.gz b/subworkflows/ebi-metagenomics/pathofact2/tests/data/test_ips.tsv.gz
new file mode 100644
index 00000000..d55d4460
Binary files /dev/null and b/subworkflows/ebi-metagenomics/pathofact2/tests/data/test_ips.tsv.gz differ
diff --git a/subworkflows/ebi-metagenomics/pathofact2/tests/data/test_protein.faa.gz b/subworkflows/ebi-metagenomics/pathofact2/tests/data/test_protein.faa.gz
new file mode 100644
index 00000000..42408274
Binary files /dev/null and b/subworkflows/ebi-metagenomics/pathofact2/tests/data/test_protein.faa.gz differ
diff --git a/subworkflows/ebi-metagenomics/pathofact2/tests/data/test_protein.gff.gz b/subworkflows/ebi-metagenomics/pathofact2/tests/data/test_protein.gff.gz
new file mode 100644
index 00000000..49a6b000
Binary files /dev/null and b/subworkflows/ebi-metagenomics/pathofact2/tests/data/test_protein.gff.gz differ
diff --git a/subworkflows/ebi-metagenomics/pathofact2/tests/main.nf.test b/subworkflows/ebi-metagenomics/pathofact2/tests/main.nf.test
new file mode 100644
index 00000000..22d638f5
--- /dev/null
+++ b/subworkflows/ebi-metagenomics/pathofact2/tests/main.nf.test
@@ -0,0 +1,148 @@
+nextflow_workflow {
+
+    name "Test Subworkflow PATHOFACT2"
+    script "../main.nf"
+    workflow "PATHOFACT2"
+    config "./nextflow.config"
+
+    tag "subworkflows"
+    tag "subworkflows_ebimetagenomics"
+    tag "subworkflows/pathofact2"
+    tag "pathofact2/toxins"
+    tag "pathofact2/downloaddata"
+    tag "pathofact2/virulence"
+    tag "pathofact2/integrator"
+    tag "pathofact2/extractfasta"
+    tag "nf-core/localcdsearch/annotate"
+    tag "nf-core/localcdsearch/download"
+    tag "nf-core/diamond/blastp"
+    tag "nf-core/diamond/makedb"
+    tag "nf-core/wget"
+    // NOTE(review): presumably fetches the five nf-core modules listed below into ${launchDir}/library/ and links them under ${baseDir}/modules/ - confirm against the plugin providing these helpers
+    setup {
+        nfcoreInitialise("${launchDir}/library/")
+        nfcoreInstall(
+            "${launchDir}/library/",
+            [
+                "localcdsearch/annotate",
+                "localcdsearch/download",
+                "diamond/blastp",
+                "diamond/makedb",
+                "wget",
+            ]
+        )
+        nfcoreLink("${launchDir}/library/", "${baseDir}/modules/")
+    }
+    // Case 1: an IPS table is supplied; the three database inputs are null, so the subworkflow downloads/builds them
+    test("proteins_test - positive prediction - ips") {
+
+        tag "positive_ips"
+
+        when {
+            workflow {
+                """
+                input[0] = channel.of([
+                    [id:'test'],
+                    file("${moduleDir}/tests/data/test_protein.faa.gz", checkIfExists: true),
+                    file("${moduleDir}/tests/data/test_protein.gff.gz", checkIfExists: true),
+                    file("${moduleDir}/tests/data/test_ips.tsv.gz", checkIfExists: true)
+                ])
+                input[1] = null
+                input[2] = null
+                input[3] = null
+                input[4] = '18223764'
+                input[5] = channel.of([
+                    [id:'VFDB_setB_pro'],
+                    'https://www.mgc.ac.cn/VFs/Down/VFDB_setB_pro.fas.gz'
+                ])
+                """
+            }
+        }
+        then {
+            assert workflow.success
+            assert workflow.trace.tasks().size() > 0
+            // Compare the emitted gff and versions channels with the stored snapshot
+            assertAll(
+                { assert snapshot(
+                    workflow.out.gff,
+                    workflow.out.versions
+                ).match() }
+            )
+        }
+    }
+    // Case 2: same proteins and gff but ips_tsv is null, so annotation goes through the local-cd-search branch
+    test("proteins_test - positive prediction - no ips, run cdd") {
+
+        tag "positive_cdd"
+
+        when {
+            workflow {
+                """
+                input[0] = channel.of([
+                    [id:'test'],
+                    file("${moduleDir}/tests/data/test_protein.faa.gz", checkIfExists: true),
+                    file("${moduleDir}/tests/data/test_protein.gff.gz", checkIfExists: true),
+                    null
+                ])
+                input[1] = null
+                input[2] = null
+                input[3] = null
+                input[4] = '18223764'
+                input[5] = channel.of([
+                    [id:'VFDB_setB_pro'],
+                    'https://www.mgc.ac.cn/VFs/Down/VFDB_setB_pro.fas.gz'
+                ])
+                """
+            }
+        }
+        then {
+            assert workflow.success
+            assert workflow.trace.tasks().size() > 0
+            // Compare the emitted gff and versions channels with the stored snapshot
+            assertAll(
+                { assert snapshot(
+                    workflow.out.gff,
+                    workflow.out.versions
+                ).match() }
+            )
+        }
+    }
+    // Case 3: negative_test.faa.gz with an IPS table; the stored snapshot holds an empty gff entry for this case
+    test("proteins_test - negative prediction - ips") {
+
+        tag "negative_ips"
+
+        when {
+            workflow {
+                """
+                input[0] = channel.of([
+                    [id:'test'],
+                    file("${moduleDir}/tests/data/negative_test.faa.gz", checkIfExists: true),
+                    file("${moduleDir}/tests/data/test_protein.gff.gz", checkIfExists: true),
+                    file("${moduleDir}/tests/data/test_ips.tsv.gz", checkIfExists: true)
+                ])
+                input[1] = null
+                input[2] = null
+                input[3] = null
+                input[4] = '18223764'
+                input[5] = channel.of([
+                    [id:'VFDB_setB_pro'],
+                    'https://www.mgc.ac.cn/VFs/Down/VFDB_setB_pro.fas.gz'
+                ])
+                """
+            }
+        }
+        then {
+            assert workflow.success
+            assert workflow.trace.tasks().size() > 0
+            // Compare the emitted gff and versions channels with the stored snapshot
+            assertAll(
+                { assert snapshot(
+                    workflow.out.gff,
+                    workflow.out.versions
+                ).match() }
+            )
+        }
+    }
+
+}
diff --git a/subworkflows/ebi-metagenomics/pathofact2/tests/main.nf.test.snap b/subworkflows/ebi-metagenomics/pathofact2/tests/main.nf.test.snap
new file mode 100644
index 00000000..3948d5fc
--- /dev/null
+++ b/subworkflows/ebi-metagenomics/pathofact2/tests/main.nf.test.snap
@@ -0,0 +1,65 @@
+{
+    "proteins_test - negative prediction - ips": {
+        "content": [
+            [
+                [
+
+                ]
+            ],
+            [
+                "versions.yml:md5,4c287e9f30be1720656657df175453ef",
+                "versions.yml:md5,6852c67539b59298b4c56b8907955e5a",
+                "versions.yml:md5,d28a54bb3b80600719af443deaa51f93"
+            ]
+        ],
+        "meta": {
+            "nf-test": "0.9.3",
+            "nextflow": "25.10.0"
+        },
+        "timestamp": "2026-02-04T12:55:41.086157"
+    },
+    "proteins_test - positive prediction - ips": {
+        "content": [
+            [
+                [
+                    {
+                        "id": "test"
+                    },
+                    "test_pathofact2.gff:md5,ea3d8767bbf7d1c1aa2ece9129b8da46"
+                ]
+            ],
+            [
+                "versions.yml:md5,4c287e9f30be1720656657df175453ef",
+                "versions.yml:md5,6852c67539b59298b4c56b8907955e5a",
+                "versions.yml:md5,d28a54bb3b80600719af443deaa51f93"
+            ]
+        ],
+        "meta": {
+            "nf-test": "0.9.3",
+            "nextflow": "25.10.0"
+        },
+        "timestamp":
"2026-02-04T12:49:45.579021"
+    },
+    "proteins_test - positive prediction - no ips, run cdd": {
+        "content": [
+            [
+                [
+                    {
+                        "id": "test"
+                    },
+                    "test_pathofact2.gff:md5,de37e2d5e1e790abfe767bcc7a802731"
+                ]
+            ],
+            [
+                "versions.yml:md5,4c287e9f30be1720656657df175453ef",
+                "versions.yml:md5,6852c67539b59298b4c56b8907955e5a",
+                "versions.yml:md5,d28a54bb3b80600719af443deaa51f93"
+            ]
+        ],
+        "meta": {
+            "nf-test": "0.9.3",
+            "nextflow": "25.10.0"
+        },
+        "timestamp": "2026-02-04T12:52:39.776572"
+    }
+}
\ No newline at end of file
diff --git a/subworkflows/ebi-metagenomics/pathofact2/tests/nextflow.config b/subworkflows/ebi-metagenomics/pathofact2/tests/nextflow.config
new file mode 100644
index 00000000..d0a1a3ea
--- /dev/null
+++ b/subworkflows/ebi-metagenomics/pathofact2/tests/nextflow.config
@@ -0,0 +1,19 @@
+nextflow.enable.moduleBinaries = true // turn on Nextflow's moduleBinaries feature flag for the test run
+process {
+    withName: DIAMOND_BLASTP {
+        ext.args = [ // option/value pairs, joined below into one space-separated string
+            "--max-target-seqs", "25",
+            "--evalue", "1e-5",
+            "--id", "30",
+            "--query-cover", "60",
+            "--subject-cover", "50",
+        ].join(' ')
+    }
+    // NOTE(review): the meaning of "-k 5" is defined by the PATHOFACT2_TOXINS module, which is not visible here - confirm
+    withName: PATHOFACT2_TOXINS {
+        ext.args = [
+            "-k", "5"
+        ].join(' ')
+    }
+
+}