-
Notifications
You must be signed in to change notification settings - Fork 1
Feature/pathofact2 subwf #141
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
10 commits
Select commit
Hold shift + click to select a range
bd17d60
Fix subworkflow tests and pre-commit
Ales-ibt b1495cb
Debugging when no ips file
Ales-ibt b254ba3
Update subworkflows/ebi-metagenomics/pathofact2/main.nf
Ales-ibt a61d0dd
Update subworkflows/ebi-metagenomics/pathofact2/main.nf
Ales-ibt f2e0ae8
Update subworkflows/ebi-metagenomics/pathofact2/tests/main.nf.test
Ales-ibt 3820f82
Update subworkflows/ebi-metagenomics/pathofact2/tests/main.nf.test
Ales-ibt f0dde08
Adding negative test
Ales-ibt 3946ea0
Move hardcoded values to params and improve empty outputs handling
Ales-ibt bc46a38
Update subworkflows/ebi-metagenomics/pathofact2/meta.yml
Ales-ibt a49e612
Update subworkflows/ebi-metagenomics/pathofact2/main.nf
Ales-ibt File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,115 @@ | ||
| // Subworkflow to generate toxins and virulence factors annotation from protein sequences | ||
| // Outputs are filtered by threshold and integrated into a single GFF3 format output | ||
| include { PATHOFACT2_DOWNLOADDATA } from '../../../modules/ebi-metagenomics/pathofact2/downloaddata/main' | ||
| include { PATHOFACT2_TOXINS } from '../../../modules/ebi-metagenomics/pathofact2/toxins/main' | ||
| include { PATHOFACT2_VIRULENCE } from '../../../modules/ebi-metagenomics/pathofact2/virulence/main' | ||
| include { PATHOFACT2_INTEGRATOR } from '../../../modules/ebi-metagenomics/pathofact2/integrator/main' | ||
| include { PATHOFACT2_EXTRACTFASTA } from '../../../modules/ebi-metagenomics/pathofact2/extractfasta/main' | ||
| include { LOCALCDSEARCH_ANNOTATE } from '../../../modules/nf-core/localcdsearch/annotate/main' | ||
| include { LOCALCDSEARCH_DOWNLOAD } from '../../../modules/nf-core/localcdsearch/download/main' | ||
| include { DIAMOND_BLASTP } from '../../../modules/nf-core/diamond/blastp/main' | ||
| include { DIAMOND_MAKEDB } from '../../../modules/nf-core/diamond/makedb/main' | ||
| include { WGET } from '../../../modules/nf-core/wget/main' | ||
|
|
||
| workflow PATHOFACT2 { | ||
| take: | ||
| ch_inputs // channel: tuple( val(meta), path(aminoacids), path(cds_gff), path(ips_tsv) ) | ||
| ch_models // channel: path( pathofact2_db ) | ||
| ch_vfdb // channel: path( vfdb ) | ||
| ch_cdd // channel: path( cdd_db ) | ||
| ch_zenodo_id // channel: value( pathofact2_db_zenodo_id ) | ||
| ch_vfdb_url // channel: tuple( val(meta2), val(vfdb_url) ) | ||
|
|
||
| main: | ||
| ch_versions = channel.empty() | ||
|
|
||
| // Extract individual components from input channel | ||
| ch_faa = ch_inputs.map{ meta, aminoacids, _cds_gff, _ips_tsv -> tuple(meta, aminoacids) } | ||
| ch_gff = ch_inputs.map{ meta, _aminoacids, cds_gff, _ips_tsv -> tuple(meta, cds_gff) } | ||
| ch_ips = ch_inputs.map{ meta, _aminoacids, _cds_gff, ips_tsv -> tuple(meta, ips_tsv) } | ||
|
|
||
| // Split inputs based on whether IPS annotation is provided | ||
| ch_ips | ||
| .branch { meta, ips_tsv -> | ||
| with_ips: ips_tsv | ||
| return tuple(meta, ips_tsv) | ||
| without_ips: !ips_tsv | ||
| return meta | ||
| } | ||
| .set { ch_ips_branched } | ||
|
|
||
| ch_with_ips = ch_ips_branched.with_ips | ||
| ch_without_ips = ch_ips_branched.without_ips | ||
|
|
||
| // Preparing databases | ||
| if (ch_models) { | ||
| pathofact_models = ch_models | ||
| } else { | ||
| PATHOFACT2_DOWNLOADDATA(ch_zenodo_id) | ||
| pathofact_models = PATHOFACT2_DOWNLOADDATA.out.zenodo_file | ||
| } | ||
|
|
||
| if (ch_vfdb) { | ||
| vfdb_diamond_db = ch_vfdb | ||
| } else { | ||
| WGET(ch_vfdb_url) | ||
| ch_versions = ch_versions.mix(WGET.out.versions.first()) | ||
| DIAMOND_MAKEDB(WGET.out.outfile, [], [], []) | ||
| ch_versions = ch_versions.mix(DIAMOND_MAKEDB.out.versions.first()) | ||
| vfdb_diamond_db = DIAMOND_MAKEDB.out.db | ||
| } | ||
|
|
||
| // Prepare CDD database (will only be used if ch_without_ips has data) | ||
| if (ch_cdd) { | ||
| cdd_database = ch_cdd | ||
| } else { | ||
| LOCALCDSEARCH_DOWNLOAD(['cdd_ncbi']) | ||
| cdd_database = LOCALCDSEARCH_DOWNLOAD.out.db | ||
| } | ||
|
|
||
Ales-ibt marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| // Running prediction | ||
| PATHOFACT2_TOXINS( ch_faa, pathofact_models ) | ||
|
|
||
| PATHOFACT2_VIRULENCE( ch_faa, pathofact_models ) | ||
|
|
||
| // Searching for hits in VFDB | ||
| DIAMOND_BLASTP( ch_faa, vfdb_diamond_db, 6, 'qseqid sseqid pident length qlen slen evalue bitscore') | ||
| ch_versions = ch_versions.mix(DIAMOND_BLASTP.out.versions.first()) | ||
|
|
||
| // Extracting positive matches | ||
| ch_extractfasta_input = ch_faa | ||
| .join(DIAMOND_BLASTP.out.txt) | ||
| .join(PATHOFACT2_TOXINS.out.tsv) | ||
| .join(PATHOFACT2_VIRULENCE.out.tsv) | ||
mberacochea marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| PATHOFACT2_EXTRACTFASTA(ch_extractfasta_input) | ||
|
|
||
| // Running annotation using local-cd-search when ips_tsv is not provided | ||
| ch_fasta_for_cdd = PATHOFACT2_EXTRACTFASTA.out.fasta | ||
| .join(ch_without_ips) | ||
| LOCALCDSEARCH_ANNOTATE(ch_fasta_for_cdd, cdd_database, false) | ||
|
|
||
| // Combine IPS annotations with CDD annotations | ||
| prot_annot = ch_with_ips.mix(LOCALCDSEARCH_ANNOTATE.out.result) | ||
|
|
||
| // Set annotation type based on source | ||
| annot_type = ch_with_ips | ||
| .map { meta, _ips_tsv -> tuple(meta, 'ips') } | ||
| .mix( | ||
| LOCALCDSEARCH_ANNOTATE.out.result.map { meta, _annot -> tuple(meta, 'cdd') } | ||
| ) | ||
|
|
||
| // Integrating results in a single gff file | ||
| ch_for_integrator = ch_gff | ||
| .join(prot_annot) | ||
| .join(PATHOFACT2_EXTRACTFASTA.out.tsv) | ||
| .join(annot_type) | ||
| PATHOFACT2_INTEGRATOR(ch_for_integrator) | ||
|
|
||
| // Handle cases where no predictions are made (integrator produces no output) | ||
| ch_gff_output = PATHOFACT2_INTEGRATOR.out.gff.ifEmpty([]) | ||
|
|
||
| emit: | ||
| gff = ch_gff_output // channel: tuple( val(meta), path(gff) ) | ||
| versions = ch_versions // channel: [ versions.yml ] | ||
|
|
||
| } | ||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,89 @@ | ||
| # yaml-language-server: $schema=https://raw.githubusercontent.com/EBI-Metagenomics/nf-modules/main/subworkflows/yaml-schema.json | ||
| name: "pathofact2" | ||
| description: Pipeline for predicting virulence factors and toxins in protein sequences | ||
| keywords: | ||
| - pathofact2 | ||
| - virulence | ||
| - toxin | ||
| - vfdb | ||
| - cdd | ||
| - gff3 | ||
| components: | ||
| - localcdsearch/annotate: | ||
| git_remote: https://github.com/nf-core/modules.git | ||
| - localcdsearch/download: | ||
| git_remote: https://github.com/nf-core/modules.git | ||
| - diamond/blastp: | ||
| git_remote: https://github.com/nf-core/modules.git | ||
| - diamond/makedb: | ||
| git_remote: https://github.com/nf-core/modules.git | ||
| - wget: | ||
| git_remote: https://github.com/nf-core/modules.git | ||
| - pathofact2/downloaddata | ||
| - pathofact2/toxins | ||
| - pathofact2/virulence | ||
| - pathofact2/integrator | ||
| - pathofact2/extractfasta | ||
|
|
||
| input: | ||
| - meta: | ||
| type: map | ||
| description: | | ||
| Groovy Map containing sample information | ||
| e.g. `[ id:'sample1' ]` | ||
| ch_inputs: | ||
| type: file | ||
| description: | | ||
| The input channel containing the protein fasta file, the gff file and the interproscan tsv file | ||
| e.g. `[ [meta], aminoacids, cds_gff, ips_tsv ]` | ||
| - ch_models: | ||
| type: directory | ||
| description: | | ||
| The input channel containing the path to the Pathofact2 models database directory | ||
| - ch_vfdb: | ||
| type: directory | ||
| description: | | ||
| The input channel containing the path to the Virulence Factors Database (VFDB) directory | ||
| - ch_cdd: | ||
| type: directory | ||
| description: | | ||
| The input channel containing the path to the Conserved Domains Database (CDD) directory | ||
| - ch_zenodo_id: | ||
| type: string | ||
| description: | | ||
| The Zenodo record ID of the Pathofact2 database | ||
| - meta2: | ||
| type: map | ||
| description: | | ||
| Groovy Map containing the VFDB database id | ||
| e.g. `[ id:'VFDB_setB_pro' ]` | ||
| ch_vfdb_url: | ||
| type: string | ||
| description: | | ||
| The url to download VFDB using wget | ||
| e.g. `[[id:'VFDB_setB_pro'], 'https://www.mgc.ac.cn/VFs/Down/VFDB_setB_pro.fas.gz']` | ||
| output: | ||
| - gff: | ||
| description: | | ||
| Channel containing a gff file with Pathofact2 annotation | ||
| Structure: [ val(meta), path("*.gff") ] | ||
| meta: | ||
| type: map | ||
| description: | | ||
| Groovy Map containing sample information | ||
| e.g. `[ id:'sample1' ]` | ||
| "*.gff": | ||
| type: file | ||
| description: gff file with Pathofact2 annotation | ||
| pattern: "*.gff" | ||
| - versions: | ||
| type: file | ||
| description: | | ||
| File containing software versions | ||
| Structure: [ path(versions.yml) ] | ||
| pattern: "versions.yml" | ||
|
|
||
| authors: | ||
| - "@Ales-ibt" | ||
| maintainers: | ||
| - "@Ales-ibt" |
Binary file added
BIN
+246 Bytes
subworkflows/ebi-metagenomics/pathofact2/tests/data/negative_test.faa.gz
Binary file not shown.
Binary file not shown.
Binary file added
BIN
+1.36 KB
subworkflows/ebi-metagenomics/pathofact2/tests/data/test_protein.faa.gz
Binary file not shown.
Binary file added
BIN
+287 Bytes
subworkflows/ebi-metagenomics/pathofact2/tests/data/test_protein.gff.gz
Binary file not shown.
148 changes: 148 additions & 0 deletions
148
subworkflows/ebi-metagenomics/pathofact2/tests/main.nf.test
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,148 @@ | ||
| nextflow_workflow { | ||
|
|
||
| name "Test Subworkflow PATHOFACT2" | ||
| script "../main.nf" | ||
| workflow "PATHOFACT2" | ||
| config "./nextflow.config" | ||
|
|
||
| tag "subworkflows" | ||
| tag "subworkflows_ebimetagenomics" | ||
| tag "subworkflows/pathofact2" | ||
| tag "pathofact2/toxins" | ||
| tag "pathofact2/downloaddata" | ||
| tag "pathofact2/virulence" | ||
| tag "pathofact2/integrator" | ||
| tag "pathofact2/extractfasta" | ||
| tag "nf-core/localcdsearch/annotate" | ||
| tag "nf-core/localcdsearch/download" | ||
| tag "nf-core/diamond/blastp" | ||
| tag "nf-core/diamond/makedb" | ||
| tag "nf-core/wget" | ||
|
|
||
| setup { | ||
| nfcoreInitialise("${launchDir}/library/") | ||
| nfcoreInstall( | ||
| "${launchDir}/library/", | ||
| [ | ||
| "localcdsearch/annotate", | ||
| "localcdsearch/download", | ||
| "diamond/blastp", | ||
| "diamond/makedb", | ||
| "wget", | ||
| ] | ||
| ) | ||
| nfcoreLink("${launchDir}/library/", "${baseDir}/modules/") | ||
| } | ||
|
|
||
| test("proteins_test - positive prediction - ips") { | ||
|
|
||
| tag "positive_ips" | ||
|
|
||
| when { | ||
| workflow { | ||
| """ | ||
| input[0] = channel.of([ | ||
| [id:'test'], | ||
| file("${moduleDir}/tests/data/test_protein.faa.gz", checkIfExists: true), | ||
| file("${moduleDir}/tests/data/test_protein.gff.gz", checkIfExists: true), | ||
| file("${moduleDir}/tests/data/test_ips.tsv.gz", checkIfExists: true) | ||
| ]) | ||
| input[1] = null | ||
| input[2] = null | ||
| input[3] = null | ||
| input[4] = '18223764' | ||
| input[5] = channel.of([ | ||
| [id:'VFDB_setB_pro'], | ||
| 'https://www.mgc.ac.cn/VFs/Down/VFDB_setB_pro.fas.gz' | ||
| ]) | ||
| """ | ||
| } | ||
| } | ||
| then { | ||
| assert workflow.success | ||
| assert workflow.trace.tasks().size() > 0 | ||
|
|
||
| assertAll( | ||
| { assert snapshot( | ||
| workflow.out.gff, | ||
| workflow.out.versions | ||
| ).match() } | ||
| ) | ||
| } | ||
| } | ||
|
|
||
| test("proteins_test - positive prediction - no ips, run cdd") { | ||
|
|
||
| tag "positive_cdd" | ||
|
|
||
| when { | ||
| workflow { | ||
| """ | ||
| input[0] = channel.of([ | ||
| [id:'test'], | ||
| file("${moduleDir}/tests/data/test_protein.faa.gz", checkIfExists: true), | ||
| file("${moduleDir}/tests/data/test_protein.gff.gz", checkIfExists: true), | ||
| null | ||
| ]) | ||
| input[1] = null | ||
| input[2] = null | ||
| input[3] = null | ||
| input[4] = '18223764' | ||
| input[5] = channel.of([ | ||
| [id:'VFDB_setB_pro'], | ||
| 'https://www.mgc.ac.cn/VFs/Down/VFDB_setB_pro.fas.gz' | ||
| ]) | ||
| """ | ||
| } | ||
| } | ||
| then { | ||
| assert workflow.success | ||
| assert workflow.trace.tasks().size() > 0 | ||
|
|
||
| assertAll( | ||
| { assert snapshot( | ||
| workflow.out.gff, | ||
| workflow.out.versions | ||
| ).match() } | ||
| ) | ||
| } | ||
| } | ||
|
|
||
| test("proteins_test - negative prediction - ips") { | ||
|
|
||
| tag "negative_ips" | ||
|
|
||
| when { | ||
| workflow { | ||
| """ | ||
| input[0] = channel.of([ | ||
| [id:'test'], | ||
| file("${moduleDir}/tests/data/negative_test.faa.gz", checkIfExists: true), | ||
| file("${moduleDir}/tests/data/test_protein.gff.gz", checkIfExists: true), | ||
| file("${moduleDir}/tests/data/test_ips.tsv.gz", checkIfExists: true) | ||
| ]) | ||
| input[1] = null | ||
| input[2] = null | ||
| input[3] = null | ||
| input[4] = '18223764' | ||
| input[5] = channel.of([ | ||
| [id:'VFDB_setB_pro'], | ||
| 'https://www.mgc.ac.cn/VFs/Down/VFDB_setB_pro.fas.gz' | ||
| ]) | ||
| """ | ||
| } | ||
| } | ||
| then { | ||
| assert workflow.success | ||
| assert workflow.trace.tasks().size() > 0 | ||
|
|
||
| assertAll( | ||
| { assert snapshot( | ||
| workflow.out.gff, | ||
| workflow.out.versions | ||
| ).match() } | ||
| ) | ||
| } | ||
| } | ||
|
|
||
| } |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.