Skip to content
Merged
115 changes: 115 additions & 0 deletions subworkflows/ebi-metagenomics/pathofact2/main.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
// Subworkflow to generate toxins and virulence factors annotation from protein sequences
// Outputs are filtered by threshold and integrated into a single GFF3 format output
include { PATHOFACT2_DOWNLOADDATA } from '../../../modules/ebi-metagenomics/pathofact2/downloaddata/main'
include { PATHOFACT2_TOXINS } from '../../../modules/ebi-metagenomics/pathofact2/toxins/main'
include { PATHOFACT2_VIRULENCE } from '../../../modules/ebi-metagenomics/pathofact2/virulence/main'
include { PATHOFACT2_INTEGRATOR } from '../../../modules/ebi-metagenomics/pathofact2/integrator/main'
include { PATHOFACT2_EXTRACTFASTA } from '../../../modules/ebi-metagenomics/pathofact2/extractfasta/main'
include { LOCALCDSEARCH_ANNOTATE } from '../../../modules/nf-core/localcdsearch/annotate/main'
include { LOCALCDSEARCH_DOWNLOAD } from '../../../modules/nf-core/localcdsearch/download/main'
include { DIAMOND_BLASTP } from '../../../modules/nf-core/diamond/blastp/main'
include { DIAMOND_MAKEDB } from '../../../modules/nf-core/diamond/makedb/main'
include { WGET } from '../../../modules/nf-core/wget/main'

workflow PATHOFACT2 {
take:
ch_inputs // channel: tuple( val(meta), path(aminoacids), path(cds_gff), path(ips_tsv) )
ch_models // channel: path( pathofact2_db ), or null to download from Zenodo
ch_vfdb // channel: path( vfdb ), or null to download and build with DIAMOND
ch_cdd // channel: path( cdd_db ), or null to download via local-cd-search
ch_zenodo_id // channel: value( pathofact2_db_zenodo_id )
ch_vfdb_url // channel: tuple( val(meta2), val(vfdb_url) )

main:
ch_versions = channel.empty()

// Split the composite input tuple into per-component channels so each
// downstream process receives only the file it needs
ch_faa = ch_inputs.map{ meta, aminoacids, _cds_gff, _ips_tsv -> tuple(meta, aminoacids) }
ch_gff = ch_inputs.map{ meta, _aminoacids, cds_gff, _ips_tsv -> tuple(meta, cds_gff) }
ch_ips = ch_inputs.map{ meta, _aminoacids, _cds_gff, ips_tsv -> tuple(meta, ips_tsv) }

// Samples that ship an InterProScan TSV reuse it directly; samples without
// one are routed to local-cd-search for CDD annotation instead.
// NOTE: the without_ips branch emits the bare meta (key only) so it can be
// joined against PATHOFACT2_EXTRACTFASTA.out.fasta below.
ch_ips
.branch { meta, ips_tsv ->
with_ips: ips_tsv
return tuple(meta, ips_tsv)
without_ips: !ips_tsv
return meta
}
.set { ch_ips_branched }

ch_with_ips = ch_ips_branched.with_ips
ch_without_ips = ch_ips_branched.without_ips

// Preparing databases: use the caller-provided ones when available,
// otherwise fetch/build them on the fly
if (ch_models) {
pathofact_models = ch_models
} else {
PATHOFACT2_DOWNLOADDATA(ch_zenodo_id)
ch_versions = ch_versions.mix(PATHOFACT2_DOWNLOADDATA.out.versions)
pathofact_models = PATHOFACT2_DOWNLOADDATA.out.zenodo_file
}

if (ch_vfdb) {
vfdb_diamond_db = ch_vfdb
} else {
WGET(ch_vfdb_url)
ch_versions = ch_versions.mix(WGET.out.versions.first())
DIAMOND_MAKEDB(WGET.out.outfile, [], [], [])
ch_versions = ch_versions.mix(DIAMOND_MAKEDB.out.versions.first())
vfdb_diamond_db = DIAMOND_MAKEDB.out.db
}

// Prepare CDD database (will only be used if ch_without_ips has data)
if (ch_cdd) {
cdd_database = ch_cdd
} else {
LOCALCDSEARCH_DOWNLOAD(['cdd_ncbi'])
ch_versions = ch_versions.mix(LOCALCDSEARCH_DOWNLOAD.out.versions)
cdd_database = LOCALCDSEARCH_DOWNLOAD.out.db
}

// Toxin and virulence-factor prediction on the protein sequences
PATHOFACT2_TOXINS( ch_faa, pathofact_models )
ch_versions = ch_versions.mix(PATHOFACT2_TOXINS.out.versions.first())

PATHOFACT2_VIRULENCE( ch_faa, pathofact_models )
ch_versions = ch_versions.mix(PATHOFACT2_VIRULENCE.out.versions.first())

// Searching for hits in VFDB (tabular outfmt 6 with a custom column set)
DIAMOND_BLASTP( ch_faa, vfdb_diamond_db, 6, 'qseqid sseqid pident length qlen slen evalue bitscore')
ch_versions = ch_versions.mix(DIAMOND_BLASTP.out.versions.first())

// Extracting positive matches: combine, per sample, the proteins with the
// VFDB hits and both prediction tables
ch_extractfasta_input = ch_faa
.join(DIAMOND_BLASTP.out.txt)
.join(PATHOFACT2_TOXINS.out.tsv)
.join(PATHOFACT2_VIRULENCE.out.tsv)
PATHOFACT2_EXTRACTFASTA(ch_extractfasta_input)
ch_versions = ch_versions.mix(PATHOFACT2_EXTRACTFASTA.out.versions.first())

// Running annotation using local-cd-search when ips_tsv is not provided;
// the join with ch_without_ips restricts the run to those samples only
ch_fasta_for_cdd = PATHOFACT2_EXTRACTFASTA.out.fasta
.join(ch_without_ips)
LOCALCDSEARCH_ANNOTATE(ch_fasta_for_cdd, cdd_database, false)
ch_versions = ch_versions.mix(LOCALCDSEARCH_ANNOTATE.out.versions.first())

// Per-sample protein annotation: IPS when supplied, CDD otherwise
prot_annot = ch_with_ips.mix(LOCALCDSEARCH_ANNOTATE.out.result)

// Tag each sample with its annotation source so the integrator knows
// which parser to apply
annot_type = ch_with_ips
.map { meta, _ips_tsv -> tuple(meta, 'ips') }
.mix(
LOCALCDSEARCH_ANNOTATE.out.result.map { meta, _annot -> tuple(meta, 'cdd') }
)

// Integrating results in a single gff file
ch_for_integrator = ch_gff
.join(prot_annot)
.join(PATHOFACT2_EXTRACTFASTA.out.tsv)
.join(annot_type)
PATHOFACT2_INTEGRATOR(ch_for_integrator)
ch_versions = ch_versions.mix(PATHOFACT2_INTEGRATOR.out.versions.first())

// Handle cases where no predictions are made (integrator produces no output)
ch_gff_output = PATHOFACT2_INTEGRATOR.out.gff.ifEmpty([])

emit:
gff = ch_gff_output // channel: tuple( val(meta), path(gff) )
versions = ch_versions // channel: [ versions.yml ]

}
89 changes: 89 additions & 0 deletions subworkflows/ebi-metagenomics/pathofact2/meta.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# yaml-language-server: $schema=https://raw.githubusercontent.com/EBI-Metagenomics/nf-modules/main/subworkflows/yaml-schema.json
name: "pathofact2"
description: Subworkflow for predicting virulence factors and toxins in protein sequences
keywords:
- pathofact2
- virulence
- toxin
- vfdb
- cdd
- gff3
components:
- localcdsearch/annotate:
git_remote: https://github.com/nf-core/modules.git
- localcdsearch/download:
git_remote: https://github.com/nf-core/modules.git
- diamond/blastp:
git_remote: https://github.com/nf-core/modules.git
- diamond/makedb:
git_remote: https://github.com/nf-core/modules.git
- wget:
git_remote: https://github.com/nf-core/modules.git
- pathofact2/downloaddata
- pathofact2/toxins
- pathofact2/virulence
- pathofact2/integrator
- pathofact2/extractfasta

input:
- meta:
type: map
description: |
Groovy Map containing sample information
e.g. `[ id:'sample1' ]`
ch_inputs:
type: file
description: |
The input channel containing the protein fasta file, the gff file and the interproscan tsv file
e.g. `[ [meta], aminoacids, cds_gff, ips_tsv ]`
- ch_models:
type: directory
description: |
The input channel containing the path to the Pathofact2 models database directory
- ch_vfdb:
type: directory
description: |
The input channel containing the path to the Virulence Factors Database (VFDB) directory
- ch_cdd:
type: directory
description: |
The input channel containing the path to Conserved Domains Database (CDD) directory
- ch_zenodo_id:
type: string
description: |
The id of the pathofact database in zenodo
- meta2:
type: map
description: |
Groovy Map containing the VFDB database id
e.g. `[ id:'VFDB_setB_pro' ]`
ch_vfdb_url:
type: string
description: |
The url to download VFDB using wget
e.g. `[[id:'VFDB_setB_pro'], 'https://www.mgc.ac.cn/VFs/Down/VFDB_setB_pro.fas.gz']`
output:
- gff:
description: |
Channel containing a gff file with Pathofact2 annotation
Structure: [ val(meta), path("*.gff") ]
meta:
type: map
description: |
Groovy Map containing sample information
e.g. `[ id:'sample1' ]`
"*.gff":
type: file
description: gff file with Pathofact2 annotation
pattern: "*.gff"
- versions:
type: file
description: |
File containing software versions
Structure: [ path(versions.yml) ]
pattern: "versions.yml"

authors:
- "@Ales-ibt"
maintainers:
- "@Ales-ibt"
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
148 changes: 148 additions & 0 deletions subworkflows/ebi-metagenomics/pathofact2/tests/main.nf.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
// nf-test specification for the PATHOFACT2 subworkflow.
// Three scenarios are covered: proteins with positive predictions annotated
// via a provided InterProScan TSV, the same proteins annotated via CDD
// (local-cd-search) when no IPS TSV is given, and a negative set that yields
// no predictions (integrator emits nothing, so the gff output falls back to []).
nextflow_workflow {

name "Test Subworkflow PATHOFACT2"
script "../main.nf"
workflow "PATHOFACT2"
config "./nextflow.config"

tag "subworkflows"
tag "subworkflows_ebimetagenomics"
tag "subworkflows/pathofact2"
tag "pathofact2/toxins"
tag "pathofact2/downloaddata"
tag "pathofact2/virulence"
tag "pathofact2/integrator"
tag "pathofact2/extractfasta"
tag "nf-core/localcdsearch/annotate"
tag "nf-core/localcdsearch/download"
tag "nf-core/diamond/blastp"
tag "nf-core/diamond/makedb"
tag "nf-core/wget"

// Install and symlink the nf-core modules that main.nf includes, so the
// test run can resolve the ../../../modules/nf-core/* include paths.
setup {
nfcoreInitialise("${launchDir}/library/")
nfcoreInstall(
"${launchDir}/library/",
[
"localcdsearch/annotate",
"localcdsearch/download",
"diamond/blastp",
"diamond/makedb",
"wget",
]
)
nfcoreLink("${launchDir}/library/", "${baseDir}/modules/")
}

// Positive prediction with an IPS TSV supplied: the local-cd-search branch
// should be skipped and the annotation type set to 'ips'.
test("proteins_test - positive prediction - ips") {

tag "positive_ips"

when {
workflow {
"""
// inputs 1-3 are null, so the subworkflow downloads the PathoFact2
// models (Zenodo record below), VFDB and CDD at run time —
// NOTE(review): this requires network access in the test environment
input[0] = channel.of([
[id:'test'],
file("${moduleDir}/tests/data/test_protein.faa.gz", checkIfExists: true),
file("${moduleDir}/tests/data/test_protein.gff.gz", checkIfExists: true),
file("${moduleDir}/tests/data/test_ips.tsv.gz", checkIfExists: true)
])
input[1] = null
input[2] = null
input[3] = null
input[4] = '18223764'
input[5] = channel.of([
[id:'VFDB_setB_pro'],
'https://www.mgc.ac.cn/VFs/Down/VFDB_setB_pro.fas.gz'
])
"""
}
}
then {
assert workflow.success
assert workflow.trace.tasks().size() > 0

// Snapshot both the integrated gff and the collected versions channel
assertAll(
{ assert snapshot(
workflow.out.gff,
workflow.out.versions
).match() }
)
}
}

// Positive prediction without an IPS TSV: the null fourth element routes
// the sample through LOCALCDSEARCH_ANNOTATE (annotation type 'cdd').
test("proteins_test - positive prediction - no ips, run cdd") {

tag "positive_cdd"

when {
workflow {
"""
input[0] = channel.of([
[id:'test'],
file("${moduleDir}/tests/data/test_protein.faa.gz", checkIfExists: true),
file("${moduleDir}/tests/data/test_protein.gff.gz", checkIfExists: true),
null
])
input[1] = null
input[2] = null
input[3] = null
input[4] = '18223764'
input[5] = channel.of([
[id:'VFDB_setB_pro'],
'https://www.mgc.ac.cn/VFs/Down/VFDB_setB_pro.fas.gz'
])
"""
}
}
then {
assert workflow.success
assert workflow.trace.tasks().size() > 0

assertAll(
{ assert snapshot(
workflow.out.gff,
workflow.out.versions
).match() }
)
}
}

// Negative proteins: no predictions are expected, exercising the
// ifEmpty([]) fallback on the integrator's gff output.
test("proteins_test - negative prediction - ips") {

tag "negative_ips"

when {
workflow {
"""
input[0] = channel.of([
[id:'test'],
file("${moduleDir}/tests/data/negative_test.faa.gz", checkIfExists: true),
file("${moduleDir}/tests/data/test_protein.gff.gz", checkIfExists: true),
file("${moduleDir}/tests/data/test_ips.tsv.gz", checkIfExists: true)
])
input[1] = null
input[2] = null
input[3] = null
input[4] = '18223764'
input[5] = channel.of([
[id:'VFDB_setB_pro'],
'https://www.mgc.ac.cn/VFs/Down/VFDB_setB_pro.fas.gz'
])
"""
}
}
then {
assert workflow.success
assert workflow.trace.tasks().size() > 0

assertAll(
{ assert snapshot(
workflow.out.gff,
workflow.out.versions
).match() }
)
}
}

}
Loading