EBI-Metagenomics
diff --git a/‎bin/filter_ips_by_contigs.py‎
Lines changed: 173 additions & 0 deletions b/‎bin/filter_ips_by_contigs.py‎
Lines changed: 173 additions & 0 deletions
diff --git a/‎conf/modules.config‎
Lines changed: 4 additions & 1 deletion b/‎conf/modules.config‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎modules/local/filter_ips_and_faa_by_contigs.nf‎
Lines changed: 49 additions & 0 deletions b/‎modules/local/filter_ips_and_faa_by_contigs.nf‎
Lines changed: 49 additions & 0 deletions
diff --git a/‎subworkflows/local/pathways_and_systems.nf‎
Lines changed: 21 additions & 11 deletions b/‎subworkflows/local/pathways_and_systems.nf‎
Lines changed: 21 additions & 11 deletions
diff --git a/‎tests/assembly_erz101.fasta.gz‎
5.72 KB b/‎tests/assembly_erz101.fasta.gz‎
5.72 KB
@@ -0,0 +1,173 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Copyright 2026 EMBL - European Bioinformatics Institute
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import argparse
+import gzip
+import logging
+import sys
+from pathlib import Path
+from typing import Iterator
+
+logging.basicConfig(
+    format="%(levelname)s: %(message)s",
+    level=logging.INFO,
+    stream=sys.stderr,
+)
+
+
+def _open(path: Path | str):
+    """Open a plain or gzip-compressed text file for reading.
+
+    :param path: Path to the file.
+    :return: A text-mode file object.
+    """
+    if Path(path).suffix == ".gz":
+        return gzip.open(path, "rt")
+    return open(path, "r")
+
+
+def _contig_ids_from_fasta(fasta_path: Path) -> set[str]:
+    """Parse a FASTA file and return the set of sequence IDs.
+
+    Only the first whitespace-delimited token of each header line is used,
+    matching the behaviour of seqkit and other common tools.
+
+    :param fasta_path: Path to the FASTA file (plain or gzip).
+    :return: Set of contig IDs (without the leading '>').
+    """
+    ids: set[str] = set()
+    with _open(fasta_path) as handle:
+        for line in handle:
+            if line.startswith(">"):
+                ids.add(line[1:].split()[0])
+
+    if not ids:
+        logging.error(f"No sequences found in {fasta_path}")
+        sys.exit(1)
+
+    return ids
+
+
+def _contig_id_from_protein_id(protein_id: str, contig_ids: set[str]) -> str | None:
+    """Derive the contig ID from a protein ID, handling both CGC callers.
+
+    The CGC produces two protein ID formats:
+
+    - Pyrodigal: ``{contig_id}_{cds_n}`` — strip the last component.
+    - FragGeneScanRS: ``{contig_id}_{start}_{end}_{strand}`` — strip the last three.
+
+    Both are tried in order; the first match against *contig_ids* is returned.
+    Returns ``None`` if neither resolves to a known contig.
+
+    :param protein_id: Protein identifier from the IPS TSV.
+    :param contig_ids: Set of known contig IDs.
+    :return: Matching contig ID, or ``None``.
+    """
+    for candidate in (protein_id.rsplit("_", 1)[0], protein_id.rsplit("_", 3)[0]):
+        if candidate in contig_ids:
+            return candidate
+    return None
+
+
+def _iter_filtered_rows(
+    ips_path: Path,
+    contig_ids: set[str],
+) -> Iterator[tuple[str, str]]:
+    """Stream IPS TSV rows that belong to contigs in *contig_ids*.
+
+    The protein ID is column 1 (0-indexed: column 0) of the tab-separated file.
+    Both Pyrodigal (``{contig_id}_{cds_n}``) and FragGeneScanRS
+    (``{contig_id}_{start}_{end}_{strand}``) protein ID formats are handled.
+
+    :param ips_path: Path to the InterProScan TSV (plain or gzip).
+    :param contig_ids: Set of contig IDs to keep.
+    :return: Iterator of ``(protein_id, raw_line)`` tuples for matching rows.
+    :raises SystemExit: If a malformed row (no tab separator) is encountered.
+    """
+    with _open(ips_path) as handle:
+        for lineno, line in enumerate(handle, start=1):
+            if not line.strip() or line.startswith("#"):
+                continue
+            parts = line.split("\t", 1)
+            if len(parts) < 2:
+                logging.error(
+                    f"Malformed IPS row at line {lineno} (no tab separator): {line.rstrip()}"
+                )
+                sys.exit(1)
+            protein_id = parts[0]
+            contig_id = _contig_id_from_protein_id(protein_id, contig_ids)
+            if contig_id is not None:
+                yield protein_id, line
+
+
+def filter_ips(
+    contigs_path: Path,
+    ips_path: Path,
+    out_path: Path,
+) -> None:
+    """Filter *ips_path* to rows whose proteins belong to the provided contigs.
+
+    Each matched protein ID is written to stdout (one per line) so the caller
+    can pipe them into ``seqkit grep -f -``.
+
+    :param contigs_path: FASTA file containing the contig subset.
+    :param ips_path: Full InterProScan TSV to filter.
+    :param out_path: Destination for the filtered TSV (written as gzip).
+    """
+    contig_ids = _contig_ids_from_fasta(contigs_path)
+
+    with gzip.open(out_path, "wt") as out_fh:
+        for protein_id, line in _iter_filtered_rows(ips_path, contig_ids):
+            out_fh.write(line)
+            sys.stdout.write(protein_id + "\n")
+    sys.stdout.flush()
+
+
+if __name__ == "__main__":
+    """Filter an InterProScan TSV to rows whose proteins belong to a given set of contigs.
+
+    Matched protein IDs are written to stdout so the caller can pipe them directly
+    into ``seqkit grep -f -`` to subset a FAA file without creating a temporary file.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--contigs",
+        required=True,
+        type=Path,
+        metavar="FASTA",
+        help="Contig chunk FASTA (plain or gzip).",
+    )
+    parser.add_argument(
+        "--ips",
+        required=True,
+        type=Path,
+        metavar="TSV",
+        help="Full InterProScan TSV (plain or gzip).",
+    )
+    parser.add_argument(
+        "--out",
+        required=True,
+        type=Path,
+        metavar="TSV_GZ",
+        help="Output filtered TSV (written as gzip).",
+    )
+    args = parser.parse_args()
+    filter_ips(
+        contigs_path=args.contigs,
+        ips_path=args.ips,
+        out_path=args.out,
+    )
@@ -477,11 +477,14 @@ process {
         cpus = 4
         // TODO: we will tweak this one
         memory = { 80.GB * task.attempt }
+    }
 
+    withName: CONCATENATE_SANNTIS_GFFS {
+        ext.prefix = { "${meta.id}_sanntis" }
         publishDir = [
             path: { "${params.outdir}/${meta.id}/pathways-and-systems/sanntis" },
             mode: params.publish_dir_mode,
-            saveAs: { filename -> filename.equals('versions.yml') ? null : filename },
+            saveAs: { filename -> filename.equals('versions.yml') ? null : filename.replaceAll('_concatenated', '') },
         ]
     }
 
 
@@ -0,0 +1,49 @@
+// Subset an InterProScan TSV and a protein FASTA (FAA) to the proteins that
+// belong to the contigs of one contig chunk.
+//
+// filter_ips_by_contigs.py writes the filtered TSV to the --out file and prints
+// the matched protein IDs on stdout; those IDs are piped into `seqkit grep -f -`
+// to extract the corresponding records from the FAA.
+//
+// NOTE(review): the exit status of filter_ips_by_contigs.py reaches Nextflow only
+// if the task shell runs with `pipefail` — confirm the `process.shell` setting.
+// NOTE(review): confirm how `seqkit grep -f -` behaves when no protein ID matched
+// (empty pattern list on stdin).
+process FILTER_IPS_AND_FAA_BY_CONTIGS {
+
+    tag "$meta.id"
+    label 'process_medium'
+
+    // Use the ORAS image under Singularity unless pulling the Docker image was
+    // explicitly requested; use the Docker image in every other case.
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'oras://community.wave.seqera.io/library/seqkit_python:3dcb98fc808deba4' :
+        'community.wave.seqera.io/library/seqkit_python:fa0b5f9a20082dfc' }"
+
+    input:
+    // meta map, contig chunk FASTA, InterProScan TSV, protein FASTA
+    tuple val(meta), path(contigs_chunk), path(ips_tsv), path(faa)
+
+    output:
+    // Filtered TSV and filtered FAA are emitted together, keyed by meta.
+    tuple val(meta), path("${prefix}_filtered_ips.tsv.gz"), path("${prefix}_filtered.faa.gz"), emit: filtered
+    path "versions.yml",                                                                       emit: versions
+
+    script:
+    // Output file prefix: task.ext.prefix when set, otherwise meta.id.
+    prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    filter_ips_by_contigs.py \\
+        --contigs ${contigs_chunk} \\
+        --ips ${ips_tsv} \\
+        --out ${prefix}_filtered_ips.tsv.gz | \\
+    seqkit grep \\
+        --threads ${task.cpus} \\
+        -f - \\
+        ${faa} \\
+        -o ${prefix}_filtered.faa.gz
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        seqkit: \$( seqkit version | sed 's/seqkit v//' )
+        python: \$( python3 --version | sed 's/Python //' )
+    END_VERSIONS
+    """
+
+    stub:
+    // Stub run: create placeholder outputs (gzip of a single newline) and still
+    // record the tool versions.
+    prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    echo "" | gzip > ${prefix}_filtered_ips.tsv.gz
+    echo "" | gzip > ${prefix}_filtered.faa.gz
+
+    cat <<-END_VERSIONS > versions.yml
+    "${task.process}":
+        seqkit: \$( seqkit version | sed 's/seqkit v//' )
+        python: \$( python3 --version | sed 's/Python //' )
+    END_VERSIONS
+    """
+}
@@ -14,9 +14,11 @@ include { KEGGPATHWAYSCOMPLETENESS     } from '../../modules/ebi-metagenomics/ke
 /* LOCAL */
 include { ANTISMASH_JSON_TO_GFF                          } from '../../modules/local/antismash_json_to_gff'
 include { CONCATENATE_GFFS as CONCATENATE_ANTISMASH_GFFS } from '../../modules/local/concatenate_gffs'
+include { CONCATENATE_GFFS as CONCATENATE_SANNTIS_GFFS   } from '../../modules/local/concatenate_gffs'
 include { ANTISMASH_SUMMARY                              } from '../../modules/local/antismash_summary'
 include { SANNTIS_SUMMARY                                } from '../../modules/local/sanntis_summary'
 include { MERGE_ANTISMASH_JSON                           } from '../../modules/local/merge_antismash_json'
+include { FILTER_IPS_AND_FAA_BY_CONTIGS                  } from '../../modules/local/filter_ips_and_faa_by_contigs'
 
 include { DRAM_DISTILL_SWF                               } from '../../subworkflows/local/dram_distill_swf'
 
@@ -118,22 +120,30 @@ workflow PATHWAYS_AND_SYSTEMS {
     )
     ch_versions = ch_versions.mix(ANTISMASH_SUMMARY.out.versions)
 
-    // Note: same weirdness as antismash_channel
-    ch_sanntis = ch_contigs_and_predicted_proteins.map { meta, _all_contigs_fasta, faa, _gff, ips_tsv ->
-        [meta, ips_tsv, [], faa]
-    }
+    // For each chunk we filter the full IPS TSV and FAA to only the proteins belonging
+    // to contigs in that chunk, then run SANNTIS in parallel and merge the results.
+    ch_sanntis_chunks = ch_contigs_and_predicted_proteins
+        .combine(SEQKIT_SPLIT2.out.assembly.transpose(), by: 0)
+        .map { meta, _all_contigs_fasta, faa, _gff, ips_tsv, contigs_chunk ->
+            [meta, contigs_chunk, ips_tsv, faa]
+        }
+
+    FILTER_IPS_AND_FAA_BY_CONTIGS(ch_sanntis_chunks)
+    ch_versions = ch_versions.mix(FILTER_IPS_AND_FAA_BY_CONTIGS.out.versions)
 
-    // We run SanntiS only once per assembly. To chunk it, we would need to ensure
-    // that each protein chunk contains annotations for only one contig. Otherwise,
-    // SanntiS might misannotate sequences, as there is no guarantee that all proteins
-    // from a single contig will be present in the same faa chunk.
     SANNTIS(
-        ch_sanntis
+        FILTER_IPS_AND_FAA_BY_CONTIGS.out.filtered
+            .map { meta, ips, faa -> [meta, ips, [], faa] }
     )
     ch_versions = ch_versions.mix(SANNTIS.out.versions)
 
+    CONCATENATE_SANNTIS_GFFS(
+        SANNTIS.out.gff.groupTuple()
+    )
+    ch_versions = ch_versions.mix(CONCATENATE_SANNTIS_GFFS.out.versions)
+
     SANNTIS_SUMMARY(
-        SANNTIS.out.gff
+        CONCATENATE_SANNTIS_GFFS.out.concatenated_gff
     )
     ch_versions = ch_versions.mix(SANNTIS_SUMMARY.out.versions)
 
@@ -150,6 +160,6 @@ workflow PATHWAYS_AND_SYSTEMS {
 
     emit:
     versions = ch_versions
-    sanntis_gff = SANNTIS.out.gff
+    sanntis_gff = CONCATENATE_SANNTIS_GFFS.out.concatenated_gff
     antismash_gff = CONCATENATE_ANTISMASH_GFFS.out.concatenated_gff
 }
Original file line number	Diff line number	Diff line change
`@@ -477,11 +477,14 @@ process {`
`477`	`477`	`cpus = 4`
`478`	`478`	`// TODO: we will tweak this one`
`479`	`479`	`memory = { 80.GB * task.attempt }`
	`480`	`+ }`
`480`	`481`
	`482`	`+ withName: CONCATENATE_SANNTIS_GFFS {`
	`483`	`+ ext.prefix = { "${meta.id}_sanntis" }`
`481`	`484`	`publishDir = [`
`482`	`485`	`path: { "${params.outdir}/${meta.id}/pathways-and-systems/sanntis" },`
`483`	`486`	`mode: params.publish_dir_mode,`
`484`		`- saveAs: { filename -> filename.equals('versions.yml') ? null : filename },`
	`487`	`+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename.replaceAll('_concatenated', '') },`
`485`	`488`	`]`
`486`	`489`	`}`
`487`	`490`