Merged · Changes from 7 commits
6 changes: 3 additions & 3 deletions .github/workflows/main.yml
@@ -2,9 +2,9 @@ name: CI

on:
push:
branches: [main, dev]
branches: [main]
pull_request:
branches: [main, dev]
branches: [main]

jobs:
Formatting:
@@ -47,7 +47,7 @@ jobs:
with:
directory: .test
snakefile: workflow/Snakefile
args: "--sdm conda --show-failed-logs --cores 1 --conda-cleanup-pkgs cache -n"
args: "--sdm conda --show-failed-logs --cores 3 --conda-cleanup-pkgs cache"

- name: Test report
uses: snakemake/[email protected]
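The updated CI step now runs a real test workflow (the former `-n` dry-run flag is gone) with three cores. Roughly the same invocation can be reproduced locally; a sketch, assuming the GitHub action maps its `directory`/`snakefile`/`args` inputs onto plain snakemake flags:

```bash
# run the .test profile locally (flag mapping of the action is an assumption)
snakemake \
    --snakefile workflow/Snakefile \
    --directory .test \
    --sdm conda --show-failed-logs \
    --cores 3 --conda-cleanup-pkgs cache
```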
10 changes: 9 additions & 1 deletion .test/config/config.yml
@@ -1,9 +1,17 @@
samplesheet: "config/samples.csv"
outdir: "results"
tool: "prokka"

pgap:
  bin: "path/to/pgap.py"
  use_yaml_config: True
  prepare_yaml_files:
    generic: "config/generic.yaml"
    submol: "config/submol.yaml"

prokka:
  center: ""
  extra: "--addgenes"

bakta:
  db: "light"
  extra: "--keep-contig-headers --compliant"
5 changes: 4 additions & 1 deletion README.md
@@ -16,7 +16,10 @@ If you use this workflow in a paper, don't forget to give credits to the authors
## Workflow overview

1. Parse the `samples.csv` table containing the samples' metadata (`python`)
2. Annotate assemblies using NCBI's Prokaryotic Genome Annotation Pipeline ([PGAP](https://github.com/ncbi/pgap))
2. Annotate assemblies using one of the following tools, selected via the `tool` option in the config (see the sketch below):
   1. NCBI's Prokaryotic Genome Annotation Pipeline ([PGAP](https://github.com/ncbi/pgap)). Note: PGAP needs to be installed manually
   2. [prokka](https://github.com/tseemann/prokka), a fast and lightweight prokaryotic annotation tool
   3. [bakta](https://github.com/oschwengers/bakta), a fast, alignment-free annotation tool. Note: Bakta automatically downloads its companion database from Zenodo (light: 1.5 GB, full: 40 GB)
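A minimal sketch of that selection, with values mirroring the `config/config.yml` and schema added in this PR (the `bakta` section shown is the test configuration; swap in `prokka` or `pgap` as needed):

```yaml
samplesheet: "config/samples.csv"
tool: "bakta"        # one of: prokka, pgap, bakta (schema enum)

bakta:
  db: "light"        # "light" (~1.5 GB) or "full" (~40 GB) database
  extra: "--keep-contig-headers --compliant"

# the schema also requires pgap: and prokka: sections to be present
```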

## Requirements

6 changes: 3 additions & 3 deletions config/README.md
@@ -5,9 +5,9 @@
This workflow requires `fasta` input data.
The samplesheet table has the following layout:

| sample | species | strain | id_prefix | file |
| ----------- | ------------ | ------------- | ------------- | ------------- |
| EC2224 | "Streptococcus pyogenes" | SF370 | Spy | assembly.fasta |
| sample | species | strain | id_prefix | file |
| ------ | ------------------------ | ------ | --------- | -------------- |
| EC2224 | "Streptococcus pyogenes" | SF370 | Spy | assembly.fasta |
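For reference, a hypothetical `samples.csv` matching this layout (comma-separated is an assumption based on the `.csv` extension; values taken from the example row above):

```
sample,species,strain,id_prefix,file
EC2224,"Streptococcus pyogenes",SF370,Spy,assembly.fasta
```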

### Execution

10 changes: 9 additions & 1 deletion config/config.yml
@@ -1,9 +1,17 @@
samplesheet: "config/samples.csv"
outdir: "results"
tool: "prokka"

pgap:
  bin: "path/to/pgap.py"
  use_yaml_config: True
  prepare_yaml_files:
    generic: "config/generic.yaml"
    submol: "config/submol.yaml"

prokka:
  center: ""
  extra: "--addgenes"

bakta:
  db: "light"
  extra: "--keep-contig-headers --compliant"
35 changes: 33 additions & 2 deletions config/schemas/config.schema.yml
@@ -6,9 +6,13 @@ properties:
samplesheet:
type: string
description: Path to the sample sheet file
outdir:
tool:
type: string
description: Output directory for results
description: Annotation tool to use
enum:
- prokka
- pgap
- bakta
pgap:
type: object
properties:
@@ -34,7 +38,34 @@ properties:
- bin
- use_yaml_config
- prepare_yaml_files
prokka:
type: object
properties:
center:
type: string
description: Center name for Prokka annotation (used in sequence IDs)
extra:
type: string
description: Extra command-line arguments for Prokka
required:
- center
- extra
bakta:
type: object
properties:
db:
type: string
description: Bakta database type, one of 'full', 'light'
extra:
type: string
description: Extra command-line arguments for Bakta
required:
- db
- extra

required:
- samplesheet
- tool
- pgap
- prokka
- bakta
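This diff does not show where the schema is enforced, but a config schema like this is typically applied with `snakemake.utils.validate`; a minimal sketch, with a hypothetical call site near the top of `workflow/Snakefile`:

```python
from snakemake.utils import validate

# raise an error if e.g. "tool" is missing or not one of prokka/pgap/bakta;
# the schema path is shown relative to the repository root and may need
# adjusting depending on where validate() is actually called
validate(config, schema="config/schemas/config.schema.yml")
```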
1 change: 1 addition & 0 deletions resources/.gitignore
@@ -0,0 +1 @@
.*
12 changes: 3 additions & 9 deletions workflow/Snakefile
@@ -19,13 +19,6 @@ container: "oras://ghcr.io/mpusp/snakemake-assembly-postprocessing:latest"
configfile: "config/config.yml"


# set outout directory
OUTDIR = config.get("outdir", "")

if not OUTDIR:
OUTDIR = f"{os.getcwd()}/results"


# -----------------------------------------------------
# load rules
# -----------------------------------------------------
@@ -44,7 +37,7 @@ end = "\033[0m"
msg = f"""\nSnakemake-assembly-postprocessing: A Snakemake workflow
for the post-processing of microbial genome assemblies."""

prolog = f"Output directory: {OUTDIR}"
prolog = f"Output directory: ./results"
year = datetime.today().year

epilog = f"""
@@ -74,7 +67,8 @@ onerror:
rule all:
input:
expand(
os.path.join(OUTDIR, "annotation/pgap/{sample}/{sample}.gff"),
"results/annotation/{tool}/{sample}/{sample}.gff",
tool=config["tool"],
sample=samples.index,
),
default_target: True
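For orientation, the rewritten `rule all` target now expands over the configured tool as well as the samples; a quick sketch of what it resolves to, assuming `tool: "prokka"` and the single example sample `EC2224` from the samplesheet documentation:

```python
from snakemake.io import expand

targets = expand(
    "results/annotation/{tool}/{sample}/{sample}.gff",
    tool="prokka",       # config["tool"]
    sample=["EC2224"],   # samples.index from the parsed samplesheet
)
print(targets)  # ['results/annotation/prokka/EC2224/EC2224.gff']
```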
7 changes: 7 additions & 0 deletions workflow/envs/bakta.yml
@@ -0,0 +1,7 @@
name: base
channels:
- conda-forge
- bioconda
- nodefaults
dependencies:
- bakta=1.9.4
7 changes: 7 additions & 0 deletions workflow/envs/prokka.yml
@@ -0,0 +1,7 @@
name: base
channels:
- conda-forge
- bioconda
- nodefaults
dependencies:
- prokka=1.14.6
119 changes: 100 additions & 19 deletions workflow/rules/annotate.smk
@@ -2,15 +2,11 @@ rule get_fasta:
input:
get_fasta,
output:
fasta=os.path.join(
OUTDIR, "annotation/pgap/prepare_files/{sample}/genome.fasta"
),
fasta="results/annotation/pgap/prepare_files/{sample}/genome.fasta",
conda:
"../envs/base.yml"
log:
os.path.join(
OUTDIR, "annotation/pgap/prepare_files/logs/{sample}_get_fasta.log"
),
"results/annotation/pgap/prepare_files/logs/{sample}_get_fasta.log",
shell:
"INPUT=$(realpath {input}); "
"ln -s ${{INPUT}} {output}; "
@@ -21,12 +17,8 @@ rule prepare_yaml_files:
input:
fasta=rules.get_fasta.output.fasta,
output:
input_yaml=os.path.join(
OUTDIR, "annotation/pgap/prepare_files/{sample}/input.yaml"
),
submol_yaml=os.path.join(
OUTDIR, "annotation/pgap/prepare_files/{sample}/submol.yaml"
),
input_yaml="results/annotation/pgap/prepare_files/{sample}/input.yaml",
submol_yaml="results/annotation/pgap/prepare_files/{sample}/submol.yaml",
conda:
"../envs/base.yml"
params:
@@ -37,10 +29,7 @@
sample="{sample}",
pd_samples=samples,
log:
os.path.join(
OUTDIR,
"annotation/pgap/prepare_files/logs/{sample}_prepare_yaml_files.log",
),
"results/annotation/pgap/prepare_files/logs/{sample}_prepare_yaml_files.log",
script:
"../scripts/prepare_yaml_files.py"

@@ -53,19 +42,19 @@ rule prepare_yaml_files:
otherwise=rules.get_fasta.output.fasta,
),
output:
os.path.join(OUTDIR, "annotation/pgap/{sample}/{sample}.gff"),
"results/annotation/pgap/{sample}/{sample}.gff",
conda:
"../envs/base.yml"
message:
"""--- Run PGAP annotation for sample {wildcards.sample} ---"""
"""--- Running PGAP annotation for sample {wildcards.sample} ---"""
params:
pgap=config["pgap"]["bin"],
use_yaml_config=config["pgap"]["use_yaml_config"],
species=lambda wc: samples.loc[wc.sample]["species"],
outdir=lambda wc, output: os.path.dirname(output[0]),
threads: 1
log:
os.path.join(OUTDIR, "annotation/pgap/logs/{sample}_pgap.log"),
"results/annotation/pgap/logs/{sample}_pgap.log",
shell:
"rm -rf {params.outdir}; "
"if [ {params.use_yaml_config} == 'True' ]; then "
Expand All @@ -85,3 +74,95 @@ rule annotate_pgap:
"--no-self-update "
"-g {input} -s '{params.species}' &>> {log}; "
"fi; "


rule annotate_prokka:
input:
fasta=rules.get_fasta.output.fasta,
output:
"results/annotation/prokka/{sample}/{sample}.gff",
conda:
"../envs/prokka.yml"
message:
"""--- Running PROKKA annotation for sample {wildcards.sample} ---"""
params:
prefix=lambda wc: wc.sample,
locustag=lambda wc: samples.loc[wc.sample]["id_prefix"],
genus=lambda wc: samples.loc[wc.sample]["species"].split(" ")[0],
species=lambda wc: samples.loc[wc.sample]["species"].split(" ")[1],
strain=lambda wc: samples.loc[wc.sample]["strain"],
outdir=lambda wc, output: os.path.dirname(output[0]),
extra=config["prokka"]["extra"],
threads: workflow.cores * 0.25
log:
"results/annotation/prokka/logs/{sample}_prokka.log",
shell:
"""
prokka \
--locustag {params.locustag} \
--genus {params.genus} \
--species {params.species} \
--strain {params.strain} \
--prefix {params.prefix} \
--outdir {params.outdir} \
--force {params.extra} \
--cpus {threads} \
{input.fasta} &> {log}
"""


rule get_bakta_db:
output:
db=directory("results/annotation/bakta/database/db"),
conda:
"../envs/bakta.yml"
message:
"""--- Getting BAKTA database for annotation ---"""
params:
db=config["bakta"]["db"],
threads: workflow.cores * 0.25
log:
"results/annotation/bakta/database/db.log",
shell:
"""
echo 'The most recent of the following available bakta DBs is downloaded:' > {log};
bakta_db list >> {log};
bakta_db download --output {output.db} --type {params.db} &>> {log}
"""


rule annotate_bakta:
input:
fasta=rules.get_fasta.output.fasta,
db=rules.get_bakta_db.output.db,
output:
"results/annotation/bakta/{sample}/{sample}.gff",
conda:
"../envs/bakta.yml"
message:
"""--- Running BAKTA annotation for sample {wildcards.sample} ---"""
params:
prefix=lambda wc: wc.sample,
locustag=lambda wc: samples.loc[wc.sample]["id_prefix"],
species=lambda wc: samples.loc[wc.sample]["species"],
strain=lambda wc: samples.loc[wc.sample]["strain"],
outdir=lambda wc, output: os.path.dirname(output[0]),
subdir="db" if config["bakta"]["db"] == "full" else "db-light",
extra=config["bakta"]["extra"],
threads: workflow.cores * 0.25
log:
"results/annotation/bakta/logs/{sample}_bakta.log",
shell:
"""
bakta \
--db {input.db}/{params.subdir} \
--prefix {params.prefix} \
--output {params.outdir} \
--locus-tag {params.locustag} \
--species '{params.species}' \
--strain {params.strain} \
--threads {threads} \
--force {params.extra} \
{input.fasta} &> {log};
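# bakta writes <prefix>.gff3; rename it to the .gff path declared as this rule's output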
mv {output}3 {output}
"""