Skip to content
Merged
115 changes: 115 additions & 0 deletions subworkflows/ebi-metagenomics/pathofact2/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
// Subworkflow to generate toxins and virulence factors annotation from protein sequences
// Outputs are filtered by threshold and integrated into a single GFF3 format output
include { PATHOFACT2_DOWNLOADDATA } from '../../../modules/ebi-metagenomics/pathofact2/downloaddata/main'
include { PATHOFACT2_TOXINS } from '../../../modules/ebi-metagenomics/pathofact2/toxins/main'
include { PATHOFACT2_VIRULENCE } from '../../../modules/ebi-metagenomics/pathofact2/virulence/main'
include { PATHOFACT2_INTEGRATOR } from '../../../modules/ebi-metagenomics/pathofact2/integrator/main'
include { PATHOFACT2_EXTRACTFASTA } from '../../../modules/ebi-metagenomics/pathofact2/extractfasta/main'
include { LOCALCDSEARCH_ANNOTATE } from '../../../modules/nf-core/localcdsearch/annotate/main'
include { LOCALCDSEARCH_DOWNLOAD } from '../../../modules/nf-core/localcdsearch/download/main'
include { DIAMOND_BLASTP } from '../../../modules/nf-core/diamond/blastp/main'
include { DIAMOND_MAKEDB } from '../../../modules/nf-core/diamond/makedb/main'
include { WGET } from '../../../modules/nf-core/wget/main'

workflow PATHOFACT2 {
take:
ch_inputs // channel: tuple( val(meta), path(aminoacids), path(cds_gff), path(ips_tsv) )
ch_models // channel: path( pathofact2_db ), or null to download from Zenodo
ch_vfdb // channel: path( vfdb ), or null to download and build with DIAMOND
ch_cdd // channel: path( cdd_db ), or null to download via local-cd-search
ch_zenodo_id // channel: value( pathofact2_db_zenodo_id )
ch_vfdb_url // channel: tuple( val(meta2), val(vfdb_url) )

main:
ch_versions = channel.empty()

// Split the composite input tuple into per-component channels so each
// downstream process receives only the file it needs
ch_faa = ch_inputs.map{ meta, aminoacids, _cds_gff, _ips_tsv -> tuple(meta, aminoacids) }
ch_gff = ch_inputs.map{ meta, _aminoacids, cds_gff, _ips_tsv -> tuple(meta, cds_gff) }
ch_ips = ch_inputs.map{ meta, _aminoacids, _cds_gff, ips_tsv -> tuple(meta, ips_tsv) }

// Samples that ship an InterProScan TSV reuse it directly; samples without
// one are routed to local-cd-search for CDD annotation instead.
// NOTE: the without_ips branch emits the bare meta (key only) so it can be
// joined against PATHOFACT2_EXTRACTFASTA.out.fasta below.
ch_ips
.branch { meta, ips_tsv ->
with_ips: ips_tsv
return tuple(meta, ips_tsv)
without_ips: !ips_tsv
return meta
}
.set { ch_ips_branched }

ch_with_ips = ch_ips_branched.with_ips
ch_without_ips = ch_ips_branched.without_ips

// Preparing databases: use the caller-provided ones when available,
// otherwise fetch/build them on the fly
if (ch_models) {
pathofact_models = ch_models
} else {
PATHOFACT2_DOWNLOADDATA(ch_zenodo_id)
ch_versions = ch_versions.mix(PATHOFACT2_DOWNLOADDATA.out.versions)
pathofact_models = PATHOFACT2_DOWNLOADDATA.out.zenodo_file
}

if (ch_vfdb) {
vfdb_diamond_db = ch_vfdb
} else {
WGET(ch_vfdb_url)
ch_versions = ch_versions.mix(WGET.out.versions.first())
DIAMOND_MAKEDB(WGET.out.outfile, [], [], [])
ch_versions = ch_versions.mix(DIAMOND_MAKEDB.out.versions.first())
vfdb_diamond_db = DIAMOND_MAKEDB.out.db
}

// Prepare CDD database (will only be used if ch_without_ips has data)
if (ch_cdd) {
cdd_database = ch_cdd
} else {
LOCALCDSEARCH_DOWNLOAD(['cdd_ncbi'])
ch_versions = ch_versions.mix(LOCALCDSEARCH_DOWNLOAD.out.versions)
cdd_database = LOCALCDSEARCH_DOWNLOAD.out.db
}

// Toxin and virulence-factor prediction on the protein sequences
PATHOFACT2_TOXINS( ch_faa, pathofact_models )
ch_versions = ch_versions.mix(PATHOFACT2_TOXINS.out.versions.first())

PATHOFACT2_VIRULENCE( ch_faa, pathofact_models )
ch_versions = ch_versions.mix(PATHOFACT2_VIRULENCE.out.versions.first())

// Searching for hits in VFDB (tabular outfmt 6 with a custom column set)
DIAMOND_BLASTP( ch_faa, vfdb_diamond_db, 6, 'qseqid sseqid pident length qlen slen evalue bitscore')
ch_versions = ch_versions.mix(DIAMOND_BLASTP.out.versions.first())

// Extracting positive matches: combine, per sample, the proteins with the
// VFDB hits and both prediction tables
ch_extractfasta_input = ch_faa
.join(DIAMOND_BLASTP.out.txt)
.join(PATHOFACT2_TOXINS.out.tsv)
.join(PATHOFACT2_VIRULENCE.out.tsv)
PATHOFACT2_EXTRACTFASTA(ch_extractfasta_input)
ch_versions = ch_versions.mix(PATHOFACT2_EXTRACTFASTA.out.versions.first())

// Running annotation using local-cd-search when ips_tsv is not provided;
// the join with ch_without_ips restricts the run to those samples only
ch_fasta_for_cdd = PATHOFACT2_EXTRACTFASTA.out.fasta
.join(ch_without_ips)
LOCALCDSEARCH_ANNOTATE(ch_fasta_for_cdd, cdd_database, false)
ch_versions = ch_versions.mix(LOCALCDSEARCH_ANNOTATE.out.versions.first())

// Per-sample protein annotation: IPS when supplied, CDD otherwise
prot_annot = ch_with_ips.mix(LOCALCDSEARCH_ANNOTATE.out.result)

// Tag each sample with its annotation source so the integrator knows
// which parser to apply
annot_type = ch_with_ips
.map { meta, _ips_tsv -> tuple(meta, 'ips') }
.mix(
LOCALCDSEARCH_ANNOTATE.out.result.map { meta, _annot -> tuple(meta, 'cdd') }
)

// Integrating results in a single gff file
ch_for_integrator = ch_gff
.join(prot_annot)
.join(PATHOFACT2_EXTRACTFASTA.out.tsv)
.join(annot_type)
PATHOFACT2_INTEGRATOR(ch_for_integrator)
ch_versions = ch_versions.mix(PATHOFACT2_INTEGRATOR.out.versions.first())

// Handle cases where no predictions are made (integrator produces no output)
ch_gff_output = PATHOFACT2_INTEGRATOR.out.gff.ifEmpty([])

emit:
gff = ch_gff_output // channel: tuple( val(meta), path(gff) )
versions = ch_versions // channel: [ versions.yml ]

}
89 changes: 89 additions & 0 deletions subworkflows/ebi-metagenomics/pathofact2/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# yaml-language-server: $schema=https://raw.githubusercontent.com/EBI-Metagenomics/nf-modules/main/subworkflows/yaml-schema.json
name: "pathofact2"
description: Subworkflow for predicting virulence factors and toxins in protein sequences
keywords:
- pathofact2
- virulence
- toxin
- vfdb
- cdd
- gff3
components:
- localcdsearch/annotate:
git_remote: https://github.com/nf-core/modules.git
- localcdsearch/download:
git_remote: https://github.com/nf-core/modules.git
- diamond/blastp:
git_remote: https://github.com/nf-core/modules.git
- diamond/makedb:
git_remote: https://github.com/nf-core/modules.git
- wget:
git_remote: https://github.com/nf-core/modules.git
- pathofact2/downloaddata
- pathofact2/toxins
- pathofact2/virulence
- pathofact2/integrator
- pathofact2/extractfasta

input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. `[ id:'sample1' ]`
ch_inputs:
type: file
description: |
The input channel containing the protein fasta file, the gff file and the interproscan tsv file
e.g. `[ [meta], aminoacids, cds_gff, ips_tsv ]`
- ch_models:
type: directory
description: |
The input channel containing the path to the Pathofact2 models database directory
- ch_vfdb:
type: directory
description: |
The input channel containing the path to the Virulence Factors Database (VFDB) directory
- ch_cdd:
type: directory
description: |
The input channel containing the path to Conserved Domains Database (CDD) directory
- ch_zenodo_id:
type: string
description: |
The id of the pathofact database in zenodo
- meta2:
type: map
description: |
Groovy Map containing the VFDB database id
e.g. `[ id:'VFDB_setB_pro' ]`
ch_vfdb_url:
type: string
description: |
The url to download VFDB using wget
e.g. `[[id:'VFDB_setB_pro'], 'https://www.mgc.ac.cn/VFs/Down/VFDB_setB_pro.fas.gz']`
output:
- gff:
description: |
Channel containing a gff file with Pathofact2 annotation
Structure: [ val(meta), path("*.gff") ]
meta:
type: map
description: |
Groovy Map containing sample information
e.g. `[ id:'sample1' ]`
"*.gff":
type: file
description: gff file with Pathofact2 annotation
pattern: "*.gff"
- versions:
type: file
description: |
File containing software versions
Structure: [ path(versions.yml) ]
pattern: "versions.yml"

authors:
- "@Ales-ibt"
maintainers:
- "@Ales-ibt"
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
148 changes: 148 additions & 0 deletions subworkflows/ebi-metagenomics/pathofact2/tests/main.nf.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
// nf-test specification for the PATHOFACT2 subworkflow.
// Three scenarios are covered: proteins with positive predictions annotated
// via a provided InterProScan TSV, the same proteins annotated via CDD
// (local-cd-search) when no IPS TSV is given, and a negative set that yields
// no predictions (integrator emits nothing, so the gff output falls back to []).
nextflow_workflow {

name "Test Subworkflow PATHOFACT2"
script "../main.nf"
workflow "PATHOFACT2"
config "./nextflow.config"

tag "subworkflows"
tag "subworkflows_ebimetagenomics"
tag "subworkflows/pathofact2"
tag "pathofact2/toxins"
tag "pathofact2/downloaddata"
tag "pathofact2/virulence"
tag "pathofact2/integrator"
tag "pathofact2/extractfasta"
tag "nf-core/localcdsearch/annotate"
tag "nf-core/localcdsearch/download"
tag "nf-core/diamond/blastp"
tag "nf-core/diamond/makedb"
tag "nf-core/wget"

// Install and symlink the nf-core modules that main.nf includes, so the
// test run can resolve the ../../../modules/nf-core/* include paths.
setup {
nfcoreInitialise("${launchDir}/library/")
nfcoreInstall(
"${launchDir}/library/",
[
"localcdsearch/annotate",
"localcdsearch/download",
"diamond/blastp",
"diamond/makedb",
"wget",
]
)
nfcoreLink("${launchDir}/library/", "${baseDir}/modules/")
}

// Positive prediction with an IPS TSV supplied: the local-cd-search branch
// should be skipped and the annotation type set to 'ips'.
test("proteins_test - positive prediction - ips") {

tag "positive_ips"

when {
workflow {
"""
// inputs 1-3 are null, so the subworkflow downloads the PathoFact2
// models (Zenodo record below), VFDB and CDD at run time —
// NOTE(review): this requires network access in the test environment
input[0] = channel.of([
[id:'test'],
file("${moduleDir}/tests/data/test_protein.faa.gz", checkIfExists: true),
file("${moduleDir}/tests/data/test_protein.gff.gz", checkIfExists: true),
file("${moduleDir}/tests/data/test_ips.tsv.gz", checkIfExists: true)
])
input[1] = null
input[2] = null
input[3] = null
input[4] = '18223764'
input[5] = channel.of([
[id:'VFDB_setB_pro'],
'https://www.mgc.ac.cn/VFs/Down/VFDB_setB_pro.fas.gz'
])
"""
}
}
then {
assert workflow.success
assert workflow.trace.tasks().size() > 0

// Snapshot both the integrated gff and the collected versions channel
assertAll(
{ assert snapshot(
workflow.out.gff,
workflow.out.versions
).match() }
)
}
}

// Positive prediction without an IPS TSV: the null fourth element routes
// the sample through LOCALCDSEARCH_ANNOTATE (annotation type 'cdd').
test("proteins_test - positive prediction - no ips, run cdd") {

tag "positive_cdd"

when {
workflow {
"""
input[0] = channel.of([
[id:'test'],
file("${moduleDir}/tests/data/test_protein.faa.gz", checkIfExists: true),
file("${moduleDir}/tests/data/test_protein.gff.gz", checkIfExists: true),
null
])
input[1] = null
input[2] = null
input[3] = null
input[4] = '18223764'
input[5] = channel.of([
[id:'VFDB_setB_pro'],
'https://www.mgc.ac.cn/VFs/Down/VFDB_setB_pro.fas.gz'
])
"""
}
}
then {
assert workflow.success
assert workflow.trace.tasks().size() > 0

assertAll(
{ assert snapshot(
workflow.out.gff,
workflow.out.versions
).match() }
)
}
}

// Negative proteins: no predictions are expected, exercising the
// ifEmpty([]) fallback on the integrator's gff output.
test("proteins_test - negative prediction - ips") {

tag "negative_ips"

when {
workflow {
"""
input[0] = channel.of([
[id:'test'],
file("${moduleDir}/tests/data/negative_test.faa.gz", checkIfExists: true),
file("${moduleDir}/tests/data/test_protein.gff.gz", checkIfExists: true),
file("${moduleDir}/tests/data/test_ips.tsv.gz", checkIfExists: true)
])
input[1] = null
input[2] = null
input[3] = null
input[4] = '18223764'
input[5] = channel.of([
[id:'VFDB_setB_pro'],
'https://www.mgc.ac.cn/VFs/Down/VFDB_setB_pro.fas.gz'
])
"""
}
}
then {
assert workflow.success
assert workflow.trace.tasks().size() > 0

assertAll(
{ assert snapshot(
workflow.out.gff,
workflow.out.versions
).match() }
)
}
}

}
Loading