lasy
diff --git a/‎00_raw_data/isolate_assembly/README.md‎
Lines changed: 96 additions & 0 deletions b/‎00_raw_data/isolate_assembly/README.md‎
Lines changed: 96 additions & 0 deletions
diff --git a/‎00_raw_data/isolate_assembly/config/config.schema.yaml‎
Lines changed: 105 additions & 0 deletions b/‎00_raw_data/isolate_assembly/config/config.schema.yaml‎
Lines changed: 105 additions & 0 deletions
diff --git a/‎00_raw_data/isolate_assembly/config/config.yaml‎
Lines changed: 36 additions & 0 deletions b/‎00_raw_data/isolate_assembly/config/config.yaml‎
Lines changed: 36 additions & 0 deletions
@@ -0,0 +1,96 @@
+# Bacterial Genome Assembly Pipeline
+
+A Snakemake workflow for bacterial genome assembly from paired-end Illumina reads.
+
+## Pipeline Overview
+
+This pipeline performs the following steps:
+1. **Adapter trimming** (cutadapt)
+2. **Quality filtering** (sickle)
+3. **Genome assembly** (Unicycler with integrated SPAdes)
+4. **Gene annotation** (Bakta)
+5. **Taxonomic classification** (GTDB-Tk)
+6. **Assembly statistics** (seqkit)
+7. **Quality assessment** (CheckM2)
+
+## Requirements
+
+- Snakemake (>= 7.0)
+- Conda/Mamba
+- SLURM cluster (optional, for HPC execution)
+
+## Setup
+
+1. Clone this repository
+2. Install required databases:
+   - Bakta database: https://github.com/oschwengers/bakta#database
+   - GTDB-Tk database: https://ecogenomics.github.io/GTDBTk/installing/index.html
+   - CheckM2 database: https://github.com/chklovski/CheckM2
+
+3. Update database paths in `config/config.yaml`:
+   ```yaml
+   bakta:
+     db: "/path/to/bakta/db"
+   gtdbtk:
+     gtdb_data_path: "/path/to/gtdbtk/data"
+   checkm2:
+     database_path: "/path/to/checkm2/database/uniref100.KO.1.dmnd"
+   ```
+
+4. Create your sample sheet in `config/samples.csv`:
+   ```csv
+   isolate_id,fastq_1,fastq_2
+   sample1,/path/to/sample1_R1.fastq.gz,/path/to/sample1_R2.fastq.gz
+   sample2,/path/to/sample2_R1.fastq.gz,/path/to/sample2_R2.fastq.gz
+   ```
+
+## Usage
+
+### Local execution
+```bash
+snakemake --use-conda --cores 8
+```
+
+### SLURM cluster execution
+```bash
+snakemake --use-conda --profile slurm
+```
+
+This command assumes a Snakemake profile named `slurm` that has been configured according to your cluster specifications.
+
+### Dry run
+```bash
+snakemake -n
+```
+
+### Generate workflow diagram
+```bash
+snakemake --dag | dot -Tpng > workflow.png
+```
+
+## Output
+
+Results are organized in the `results/` directory:
+- `cutadapt/`: Adapter-trimmed reads
+- `sickle/`: Quality-filtered reads
+- `unicycler/`: Genome assemblies
+- `bakta/`: Gene annotations
+- `gtdbtk/`: Taxonomic classifications
+- `seqkit/`: Assembly statistics
+- `checkm2/`: Quality assessment reports
+- `summary/`: Combined summary tables
+
+## Configuration
+
+Edit `config/config.yaml` to adjust parameters for each tool.
+
+## Resource Requirements
+
+The pipeline is configured with SLURM resource allocations:
+- Unicycler (with SPAdes): 64GB RAM, 24 CPUs
+- GTDB-Tk: 128GB RAM, 32 CPUs
+- CheckM2: 32GB RAM, 16 CPUs
+- Bakta: 16GB RAM, 8 CPUs
+- Other tools: 2-4GB RAM, 1-4 CPUs
+
+Adjust these in the rule definitions as needed for your system.
@@ -0,0 +1,105 @@
+# JSON Schema for the pipeline configuration mapping.
+$schema: 'https://json-schema.org/draft/2020-12/schema'
+type: object
+properties:
+  # Sample sheet location, given as a plain string.
+  samples:
+    description: 'Path to samples CSV file'
+    type: string
+  
+  # Adapter-trimming section; every key listed under `required` must be set.
+  cutadapt:
+    type: object
+    required:
+      - adapter_r1
+      - adapter_r2
+      - min_length
+      - quality_cutoff
+    properties:
+      adapter_r1:
+        description: "3' adapter sequence to trim from R1 reads"
+        type: string
+      adapter_r2:
+        description: "3' adapter sequence to trim from R2 reads"
+        type: string
+      # Reads must keep at least one base, hence the lower bound of 1.
+      min_length:
+        description: 'Minimum read length after trimming'
+        type: integer
+        minimum: 1
+      quality_cutoff:
+        description: 'Quality score cutoff for trimming'
+        type: integer
+        minimum: 0
+  
+  # Quality-filtering section; all three keys are mandatory.
+  sickle:
+    type: object
+    required:
+      - quality_type
+      - quality_threshold
+      - length_threshold
+    properties:
+      # Only the three encodings listed in `enum` are accepted.
+      quality_type:
+        description: 'Quality score encoding type'
+        type: string
+        enum:
+          - sanger
+          - illumina
+          - solexa
+      quality_threshold:
+        description: 'Minimum quality score threshold'
+        type: integer
+        minimum: 0
+      length_threshold:
+        description: 'Minimum read length after quality trimming'
+        type: integer
+        minimum: 1
+  
+  # Assembly section; all five keys are mandatory.
+  unicycler:
+    type: object
+    required:
+      - mode
+      - min_fasta_length
+      - kmers
+      - keep
+      - spades_options
+    properties:
+      mode:
+        description: 'Unicycler assembly mode'
+        type: string
+        enum:
+          - conservative
+          - normal
+          - bold
+      min_fasta_length:
+        description: 'Minimum contig length in output'
+        type: integer
+        minimum: 1
+      # Kept as a single string (not a list) of comma-separated values.
+      kmers:
+        description: 'Comma-separated list of k-mer sizes for SPAdes'
+        type: string
+      # Integer restricted to the closed range 0-3.
+      keep:
+        description: 'Level of file retention (0-3)'
+        type: integer
+        minimum: 0
+        maximum: 3
+      spades_options:
+        description: 'Additional options to pass to SPAdes'
+        type: string
+  
+  # Annotation section; all four keys are mandatory.
+  bakta:
+    type: object
+    required:
+      - db
+      - genus
+      - species
+      - min_contig_length
+    properties:
+      db:
+        description: 'Path to Bakta database'
+        type: string
+      genus:
+        description: 'Genus name for annotation'
+        type: string
+      species:
+        description: 'Species name for annotation'
+        type: string
+      min_contig_length:
+        description: 'Minimum contig length to annotate'
+        type: integer
+        minimum: 1
+  
+  # Taxonomic-classification section; the database path is the only key.
+  gtdbtk:
+    type: object
+    required:
+      - gtdb_data_path
+    properties:
+      gtdb_data_path:
+        description: 'Path to GTDB-Tk database'
+        type: string
+  
+  # Quality-assessment section; both keys are mandatory.
+  checkm2:
+    type: object
+    required:
+      - database_path
+      - lowmem
+    properties:
+      database_path:
+        description: 'Path to CheckM2 database file'
+        type: string
+      # Declared as a string, not a boolean.
+      lowmem:
+        description: 'Low memory mode flag'
+        type: string
+
+# Top-level keys that must be present in the configuration.
+required:
+  - samples
+  - cutadapt
+  - sickle
+  - unicycler
+  - bakta
+  - gtdbtk
+  - checkm2
@@ -0,0 +1,36 @@
+# Path to the CSV sample sheet.
+samples: 'config/samples.csv'
+
+# Settings under the `fastp` key.
+# NOTE(review): config/config.schema.yaml defines and requires `cutadapt` and
+# `sickle` sections and has no `fastp` entry - confirm which set is current.
+fastp:
+  # Adapter sequences; left empty so they are auto-detected.
+  adapter_r1: ''
+  adapter_r2: ''
+  # Shortest read length kept after trimming.
+  length_required: 50
+  # Quality filtering threshold.
+  qualified_quality_phred: 20
+  # Trim polyG tails (NextSeq/NovaSeq).
+  trim_poly_g: true
+  # Trim polyX tails.
+  trim_poly_x: true
+  # Complexity filtering threshold.
+  complexity_threshold: 30
+
+# Unicycler assembly settings.
+unicycler:
+  # Assembly mode.
+  mode: 'normal'
+  # Level of file retention.
+  keep: 1
+  # Minimum contig length in output.
+  min_fasta_length: 200
+  # Comma-separated k-mer sizes, quoted so the value stays a string.
+  kmers: '21,33,55,77,99,127'
+  # Additional options to pass to SPAdes.
+  spades_options: '--careful'
+
+# Bakta annotation settings.
+bakta:
+  # Path to the Bakta database.
+  db: '/n/groups/kwon/joseph/dbs/bakta_db_v5/'
+  # Minimum contig length to annotate.
+  min_contig_length: 1000
+  # Genus and species names used for annotation.
+  genus: 'Unknown'
+  species: 'sp.'
+
+# GTDB-Tk settings.
+gtdbtk:
+  # `gtdb_data_path` is the key that config/config.schema.yaml requires and
+  # that the README documents for this section.
+  gtdb_data_path: &gtdbtk_db "/n/groups/kwon/joseph/dbs/gtdb"
+  # Previous key name, aliased to the same value so that any rule still
+  # reading `database_path` keeps working.
+  # NOTE(review): remove once the rules are confirmed to read `gtdb_data_path`.
+  database_path: *gtdbtk_db
+
+# CheckM2 quality-assessment settings.
+checkm2:
+  # Low memory mode flag, stored as a string.
+  lowmem: '--lowmem'
+  # Path to the CheckM2 database file.
+  database_path: '/n/groups/kwon/joseph/dbs/checkm2/CheckM2_database/uniref100.KO.1.dmnd'