Merged · Changes from 7 commits
6 changes: 3 additions & 3 deletions .github/workflows/main.yml
@@ -2,9 +2,9 @@ name: CI

on:
push:
branches: [main, dev]
branches: [main]
pull_request:
branches: [main, dev]
branches: [main]

jobs:
Formatting:
@@ -47,7 +47,7 @@ jobs:
with:
directory: .test
snakefile: workflow/Snakefile
args: "--sdm conda --show-failed-logs --cores 1 --conda-cleanup-pkgs cache -n"
args: "--sdm conda --show-failed-logs --cores 3 --conda-cleanup-pkgs cache"

- name: Test report
uses: snakemake/[email protected]
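The updated CI step now runs a real test workflow (the former `-n` dry-run flag is gone) with three cores. Roughly the same invocation can be reproduced locally; a sketch, assuming the GitHub action maps its `directory`/`snakefile`/`args` inputs onto plain snakemake flags:

```bash
# run the .test profile locally (flag mapping of the action is an assumption)
snakemake \
    --snakefile workflow/Snakefile \
    --directory .test \
    --sdm conda --show-failed-logs \
    --cores 3 --conda-cleanup-pkgs cache
```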
10 changes: 9 additions & 1 deletion .test/config/config.yml
@@ -1,9 +1,17 @@
samplesheet: "config/samples.csv"
outdir: "results"
tool: "prokka"

pgap:
  bin: "path/to/pgap.py"
  use_yaml_config: True
  prepare_yaml_files:
    generic: "config/generic.yaml"
    submol: "config/submol.yaml"

prokka:
  center: ""
  extra: "--addgenes"

bakta:
  db: "light"
  extra: "--keep-contig-headers --compliant"
5 changes: 4 additions & 1 deletion README.md
@@ -16,7 +16,10 @@ If you use this workflow in a paper, don't forget to give credits to the authors
## Workflow overview

1. Parse the `samples.csv` table containing the samples' metadata (`python`)
2. Annotate assemblies using NCBI's Prokaryotic Genome Annotation Pipeline ([PGAP](https://github.com/ncbi/pgap))
2. Annotate assemblies using one of the following tools, selected via the `tool` option in the config (see the sketch below):
   1. NCBI's Prokaryotic Genome Annotation Pipeline ([PGAP](https://github.com/ncbi/pgap)). Note: PGAP needs to be installed manually
   2. [prokka](https://github.com/tseemann/prokka), a fast and lightweight prokaryotic annotation tool
   3. [bakta](https://github.com/oschwengers/bakta), a fast, alignment-free annotation tool. Note: Bakta automatically downloads its companion database from Zenodo (light: 1.5 GB, full: 40 GB)
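A minimal sketch of that selection, with values mirroring the `config/config.yml` and schema added in this PR (the `bakta` section shown is the test configuration; swap in `prokka` or `pgap` as needed):

```yaml
samplesheet: "config/samples.csv"
tool: "bakta"        # one of: prokka, pgap, bakta (schema enum)

bakta:
  db: "light"        # "light" (~1.5 GB) or "full" (~40 GB) database
  extra: "--keep-contig-headers --compliant"

# the schema also requires pgap: and prokka: sections to be present
```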

## Requirements

6 changes: 3 additions & 3 deletions config/README.md
@@ -5,9 +5,9 @@
This workflow requires `fasta` input data.
The samplesheet table has the following layout:

| sample | species | strain | id_prefix | file |
| ----------- | ------------ | ------------- | ------------- | ------------- |
| EC2224 | "Streptococcus pyogenes" | SF370 | Spy | assembly.fasta |
| sample | species | strain | id_prefix | file |
| ------ | ------------------------ | ------ | --------- | -------------- |
| EC2224 | "Streptococcus pyogenes" | SF370 | Spy | assembly.fasta |
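For reference, a hypothetical `samples.csv` matching this layout (comma-separated is an assumption based on the `.csv` extension; values taken from the example row above):

```
sample,species,strain,id_prefix,file
EC2224,"Streptococcus pyogenes",SF370,Spy,assembly.fasta
```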

### Execution

10 changes: 9 additions & 1 deletion config/config.yml
@@ -1,9 +1,17 @@
samplesheet: "config/samples.csv"
outdir: "results"
tool: "prokka"

pgap:
  bin: "path/to/pgap.py"
  use_yaml_config: True
  prepare_yaml_files:
    generic: "config/generic.yaml"
    submol: "config/submol.yaml"

prokka:
  center: ""
  extra: "--addgenes"

bakta:
  db: "light"
  extra: "--keep-contig-headers --compliant"
35 changes: 33 additions & 2 deletions config/schemas/config.schema.yml
@@ -6,9 +6,13 @@ properties:
samplesheet:
type: string
description: Path to the sample sheet file
outdir:
tool:
type: string
description: Output directory for results
description: Annotation tool to use
enum:
- prokka
- pgap
- bakta
pgap:
type: object
properties:
@@ -34,7 +38,34 @@ properties:
- bin
- use_yaml_config
- prepare_yaml_files
prokka:
type: object
properties:
center:
type: string
description: Center name for Prokka annotation (used in sequence IDs)
extra:
type: string
description: Extra command-line arguments for Prokka
required:
- center
- extra
bakta:
type: object
properties:
db:
type: string
description: Bakta database type, one of 'full', 'light'
extra:
type: string
description: Extra command-line arguments for Bakta
required:
- db
- extra

required:
- samplesheet
- tool
- pgap
- prokka
- bakta
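This diff does not show where the schema is enforced, but a config schema like this is typically applied with `snakemake.utils.validate`; a minimal sketch, with a hypothetical call site near the top of `workflow/Snakefile`:

```python
from snakemake.utils import validate

# raise an error if e.g. "tool" is missing or not one of prokka/pgap/bakta;
# the schema path is shown relative to the repository root and may need
# adjusting depending on where validate() is actually called
validate(config, schema="config/schemas/config.schema.yml")
```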
1 change: 1 addition & 0 deletions resources/.gitignore
@@ -0,0 +1 @@
.*
12 changes: 3 additions & 9 deletions workflow/Snakefile
@@ -19,13 +19,6 @@ container: "oras://ghcr.io/mpusp/snakemake-assembly-postprocessing:latest"
configfile: "config/config.yml"


# set outout directory
OUTDIR = config.get("outdir", "")

if not OUTDIR:
OUTDIR = f"{os.getcwd()}/results"


# -----------------------------------------------------
# load rules
# -----------------------------------------------------
@@ -44,7 +37,7 @@ end = "\033[0m"
msg = f"""\nSnakemake-assembly-postprocessing: A Snakemake workflow
for the post-processing of microbial genome assemblies."""

prolog = f"Output directory: {OUTDIR}"
prolog = f"Output directory: ./results"
year = datetime.today().year

epilog = f"""
@@ -74,7 +67,8 @@ onerror:
rule all:
input:
expand(
os.path.join(OUTDIR, "annotation/pgap/{sample}/{sample}.gff"),
"results/annotation/{tool}/{sample}/{sample}.gff",
tool=config["tool"],
sample=samples.index,
),
default_target: True
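For orientation, the rewritten `rule all` target now expands over the configured tool as well as the samples; a quick sketch of what it resolves to, assuming `tool: "prokka"` and the single example sample `EC2224` from the samplesheet documentation:

```python
from snakemake.io import expand

targets = expand(
    "results/annotation/{tool}/{sample}/{sample}.gff",
    tool="prokka",       # config["tool"]
    sample=["EC2224"],   # samples.index from the parsed samplesheet
)
print(targets)  # ['results/annotation/prokka/EC2224/EC2224.gff']
```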
7 changes: 7 additions & 0 deletions workflow/envs/bakta.yml
@@ -0,0 +1,7 @@
name: base
channels:
- conda-forge
- bioconda
- nodefaults
dependencies:
- bakta=1.9.4
7 changes: 7 additions & 0 deletions workflow/envs/prokka.yml
@@ -0,0 +1,7 @@
name: base
channels:
- conda-forge
- bioconda
- nodefaults
dependencies:
- prokka=1.14.6
119 changes: 100 additions & 19 deletions workflow/rules/annotate.smk
@@ -2,15 +2,11 @@ rule get_fasta:
input:
get_fasta,
output:
fasta=os.path.join(
OUTDIR, "annotation/pgap/prepare_files/{sample}/genome.fasta"
),
fasta="results/annotation/pgap/prepare_files/{sample}/genome.fasta",
conda:
"../envs/base.yml"
log:
os.path.join(
OUTDIR, "annotation/pgap/prepare_files/logs/{sample}_get_fasta.log"
),
"results/annotation/pgap/prepare_files/logs/{sample}_get_fasta.log",
shell:
"INPUT=$(realpath {input}); "
"ln -s ${{INPUT}} {output}; "
@@ -21,12 +17,8 @@ rule prepare_yaml_files:
input:
fasta=rules.get_fasta.output.fasta,
output:
input_yaml=os.path.join(
OUTDIR, "annotation/pgap/prepare_files/{sample}/input.yaml"
),
submol_yaml=os.path.join(
OUTDIR, "annotation/pgap/prepare_files/{sample}/submol.yaml"
),
input_yaml="results/annotation/pgap/prepare_files/{sample}/input.yaml",
submol_yaml="results/annotation/pgap/prepare_files/{sample}/submol.yaml",
conda:
"../envs/base.yml"
params:
@@ -37,10 +29,7 @@
sample="{sample}",
pd_samples=samples,
log:
os.path.join(
OUTDIR,
"annotation/pgap/prepare_files/logs/{sample}_prepare_yaml_files.log",
),
"results/annotation/pgap/prepare_files/logs/{sample}_prepare_yaml_files.log",
script:
"../scripts/prepare_yaml_files.py"

@@ -53,19 +42,19 @@ rule prepare_yaml_files:
otherwise=rules.get_fasta.output.fasta,
),
output:
os.path.join(OUTDIR, "annotation/pgap/{sample}/{sample}.gff"),
"results/annotation/pgap/{sample}/{sample}.gff",
conda:
"../envs/base.yml"
message:
"""--- Run PGAP annotation for sample {wildcards.sample} ---"""
"""--- Running PGAP annotation for sample {wildcards.sample} ---"""
params:
pgap=config["pgap"]["bin"],
use_yaml_config=config["pgap"]["use_yaml_config"],
species=lambda wc: samples.loc[wc.sample]["species"],
outdir=lambda wc, output: os.path.dirname(output[0]),
threads: 1
log:
os.path.join(OUTDIR, "annotation/pgap/logs/{sample}_pgap.log"),
"results/annotation/pgap/logs/{sample}_pgap.log",
shell:
"rm -rf {params.outdir}; "
"if [ {params.use_yaml_config} == 'True' ]; then "
Expand All @@ -85,3 +74,95 @@ rule annotate_pgap:
"--no-self-update "
"-g {input} -s '{params.species}' &>> {log}; "
"fi; "


rule annotate_prokka:
input:
fasta=rules.get_fasta.output.fasta,
output:
"results/annotation/prokka/{sample}/{sample}.gff",
conda:
"../envs/prokka.yml"
message:
"""--- Running PROKKA annotation for sample {wildcards.sample} ---"""
params:
prefix=lambda wc: wc.sample,
locustag=lambda wc: samples.loc[wc.sample]["id_prefix"],
genus=lambda wc: samples.loc[wc.sample]["species"].split(" ")[0],
species=lambda wc: samples.loc[wc.sample]["species"].split(" ")[1],
strain=lambda wc: samples.loc[wc.sample]["strain"],
outdir=lambda wc, output: os.path.dirname(output[0]),
extra=config["prokka"]["extra"],
threads: workflow.cores * 0.25
log:
"results/annotation/prokka/logs/{sample}_prokka.log",
shell:
"""
prokka \
--locustag {params.locustag} \
--genus {params.genus} \
--species {params.species} \
--strain {params.strain} \
--prefix {params.prefix} \
--outdir {params.outdir} \
--force {params.extra} \
--cpus {threads} \
{input.fasta} &> {log}
"""


rule get_bakta_db:
output:
db=directory("results/annotation/bakta/database/db"),
conda:
"../envs/bakta.yml"
message:
"""--- Getting BAKTA database for annotation ---"""
params:
db=config["bakta"]["db"],
threads: workflow.cores * 0.25
log:
"results/annotation/bakta/database/db.log",
shell:
"""
echo 'The most recent of the following available bakta DBs is downloaded:' > {log};
bakta_db list >> {log};
bakta_db download --output {output.db} --type {params.db} &>> {log}
"""


rule annotate_bakta:
input:
fasta=rules.get_fasta.output.fasta,
db=rules.get_bakta_db.output.db,
output:
"results/annotation/bakta/{sample}/{sample}.gff",
conda:
"../envs/bakta.yml"
message:
"""--- Running BAKTA annotation for sample {wildcards.sample} ---"""
params:
prefix=lambda wc: wc.sample,
locustag=lambda wc: samples.loc[wc.sample]["id_prefix"],
species=lambda wc: samples.loc[wc.sample]["species"],
strain=lambda wc: samples.loc[wc.sample]["strain"],
outdir=lambda wc, output: os.path.dirname(output[0]),
subdir="db" if config["bakta"]["db"] == "full" else "db-light",
extra=config["bakta"]["extra"],
threads: workflow.cores * 0.25
log:
"results/annotation/bakta/logs/{sample}_bakta.log",
shell:
"""
bakta \
--db {input.db}/{params.subdir} \
--prefix {params.prefix} \
--output {params.outdir} \
--locus-tag {params.locustag} \
--species '{params.species}' \
--strain {params.strain} \
--threads {threads} \
--force {params.extra} \
{input.fasta} &> {log};
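# bakta writes <prefix>.gff3; rename it to the .gff path declared as this rule's output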
mv {output}3 {output}
"""