feat: add tRNA reference validation and building step (#78)

jayhesselberth · claude · web-flow · commit 534f0c64f921 · 2026-01-14T10:54:53.000-07:00
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,7 @@ All notable changes to the aa-tRNA-seq pipeline are documented in this file.
 ## Unreleased
 
 ### Added
+- Reference validation and building step to ensure tRNA sequences have proper CCA endings and adapter structure required for charging classification
 - Optional WarpDemuX barcode demultiplexing support for pooled/multiplexed sequencing runs (#74)
 - Optimized modkit thresholds from ModkitOpt
 - Pixi package manager support as primary environment manager
diff --git a/config/config-base.yml b/config/config-base.yml
@@ -8,6 +8,30 @@ base_calling_model: "resources/models/rna004_130bps_sup@v5.1.0"
 # a BWA index will be built if it does not exist for this fasta file
 fasta: "resources/ref/sacCer3-mature-tRNAs-dual-adapt-v2.fa"
 
+# Adapter sequences for tRNA reference building/validation
+# These must match what the Remora charging model was trained on.
+# The charging classification uses the CCAGGC 6-mer junction:
+#   CCA = tRNA 3' end (last 3 bases of mature tRNA)
+#   GGC = first 3 bases of 3' adapter
+# Therefore, the 3' adapter MUST start with GGC for classification to work.
+adapters:
+  # 5' adapter prepended to tRNA (23bp, the first tRNA base is included as variable N)
+  five_prime: "CCTAAGAGCAAGAAGAAGCCTGG"
+  # 3' adapter appended after tRNA CCA end (40bp, starts with GGC)
+  three_prime: "GGCTTCTTCTTGCTCTTCCAACCTTGCCTTAAAAAAAAAA"
+
+# Reference validation/building mode
+# The pipeline validates that the reference FASTA has proper adapter structure
+# before alignment. This ensures the CCAGGC junction exists for charging classification.
+#
+# Modes:
+#   validate: Check existing adapted reference (default)
+#   build: Create adapted reference from raw tRNA sequences
+reference:
+  mode: "validate"
+  # For build mode only: path to raw tRNA FASTA (without adapters; a missing CCA end is added with a warning)
+  raw_fasta: null
+
 # If a kmer table is provided then the pipeline will use get_signal_metrics.py to extract metrics using remora
 # from: https://github.com/nanoporetech/kmer_models/tree/master/rna004
 remora_kmer_table: "resources/kmers/9mer_levels_v1.txt"
@@ -24,41 +48,41 @@ dorado_model: rna004_130bps_sup@v5.1.0
 # see https://github.com/comprna/modkitopt
 # these params improve F1 by 51% (m6A) and 1251% (pseU) compared to defaults
 modkit:
-    # global threshold for canonical base confidence
-    filter_threshold: 0.5
-    # per-modification pass thresholds (mod code or ChEBI ID : threshold)
-    #   a = N6-methyladenosine (m6A)
-    #   m = 5-methylcytosine (m5C)
-    #   17802 = pseudouridine (pseU)
-    #   17596 = inosine
-    mod_thresholds:
-        a: 0.99
-        m: 0.99
-        "17802": 0.995
-        "17596": 0.99
+  # global threshold for canonical base confidence
+  filter_threshold: 0.5
+  # per-modification pass thresholds (mod code or ChEBI ID : threshold)
+  #   a = N6-methyladenosine (m6A)
+  #   m = 5-methylcytosine (m5C)
+  #   17802 = pseudouridine (pseU)
+  #   17596 = inosine
+  mod_thresholds:
+    a: 0.99
+    m: 0.99
+    "17802": 0.995
+    "17596": 0.99
 
 # additional options for particular commands
 opts:
-    # additional options for dorado basecalling
-    # XXX place modified bases first as the arg parser gets confused
-    # XXX add `-v` for verbose logging
-    dorado: " --modified-bases pseU m5C inosine_m6A --emit-moves "
+  # additional options for dorado basecalling
+  # XXX place modified bases first as the arg parser gets confused
+  # XXX add `-v` for verbose logging
+  dorado: " --modified-bases pseU m5C inosine_m6A --emit-moves "
 
-    # additional options for bwa alignment
-    # based on Novoa lab optimising bwa for tRNA alignment
-    # the -h 20 option is used to increase the number of secondary alignments reported in the XA tag
-    bwa: " -W 13 -k 6 -T 20 -x ont2d"
+  # additional options for bwa alignment
+  # based on Novoa lab optimising bwa for tRNA alignment
+  # NOTE: adding `-h 20` would increase the number of secondary alignments reported in the XA tag (not included in the options below)
+  bwa: " -W 13 -k 6 -T 20 -x ont2d"
 
-    # requires positive strand alignment
-    # requires at least 1 5' adapter base
-    # requires 1 3' adapter base in the discriminating adapter region between charged and uncharged (v2 adapters).
-    bam_filter: "-5 24 -3 23 -s"
+  # requires positive strand alignment
+  # requires at least 1 5' adapter base
+  # requires 1 3' adapter base in the discriminating adapter region between charged and uncharged (v2 adapters).
+  bam_filter: "-5 24 -3 23 -s"
 
-    #requires positive strand alignment and excludes non-primary alignments
-    coverage: "--filterRNAstrand 'reverse' --samFlagExclude 256"
+  # requires positive strand alignment and excludes non-primary alignments
+  coverage: "--filterRNAstrand 'reverse' --samFlagExclude 256"
 
-    # pass additional options to get_signal_metrics.py script which uses Remora to calculate metrics
-    remora: ""
+  # pass additional options to get_signal_metrics.py script which uses Remora to calculate metrics
+  remora: ""
 
 # WarpDemuX demultiplexing (optional, disabled by default)
 #
@@ -71,7 +95,7 @@ opts:
 #
 # WDX4_tRNA_rna004_v1_0 has improved recovery (+3-7%) compared to WDX4b_tRNA_rna004_v1_0.
 warpdemux:
-    enabled: false
-    barcode_kit: "WDX4_tRNA_rna004_v1_0"
-    save_boundaries: true
-    threads: 8
+  enabled: false
+  barcode_kit: "WDX4_tRNA_rna004_v1_0"
+  save_boundaries: true
+  threads: 8
diff --git a/docs/user-guide/configuration.md b/docs/user-guide/configuration.md
@@ -53,8 +53,74 @@ fasta: "resources/ref/sacCer3-mature-tRNAs-dual-adapt-v2.fa"
 
 A BWA index is built automatically if it doesn't exist.
 
+### Adapter Sequences
+
+The pipeline uses adapter sequences for reference validation and building. These must match what the Remora charging model was trained on:
+
+```yaml
+adapters:
+  # 5' adapter prepended to tRNA (23bp)
+  five_prime: "CCTAAGAGCAAGAAGAAGCCTGG"
+  # 3' adapter appended after tRNA CCA end (40bp)
+  three_prime: "GGCTTCTTCTTGCTCTTCCAACCTTGCCTTAAAAAAAAAA"
+```
+
+!!! important "CCAGGC Junction"
+    The charging classification uses the **CCAGGC** 6-mer junction where:
+
+    - **CCA** = last 3 bases of mature tRNA
+    - **GGC** = first 3 bases of 3' adapter
+
+    The 3' adapter **must** start with GGC for classification to work correctly.
+
+### Reference Validation and Building
+
+The pipeline validates that the reference FASTA has proper adapter structure before alignment:
+
+```yaml
+reference:
+  # Mode: "validate" (default) or "build"
+  mode: "validate"
+  # For build mode: path to raw tRNA FASTA (without adapters)
+  raw_fasta: null
+```
+
+| Mode | Description |
+|------|-------------|
+| `validate` | Check existing adapted reference has correct structure |
+| `build` | Create adapted reference from raw tRNA sequences |
+
+#### Validate Mode (Default)
+
+Checks that each sequence in your reference has:
+
+- Correct 5' adapter prefix
+- tRNA portion ending with CCA
+- Correct 3' adapter suffix (starting with GGC)
+- Valid CCAGGC junction for charging classification
+
+#### Build Mode
+
+Creates an adapted reference from raw tRNA sequences:
+
+1. Reads raw tRNA FASTA (without adapters)
+2. Adds CCA to sequences missing it (with warning)
+3. Prepends 5' adapter
+4. Appends 3' adapter after CCA
+5. Verifies CCAGGC junction is created
+
+```yaml
+# Example: building reference from raw tRNAs
+reference:
+  mode: "build"
+  raw_fasta: "resources/ref/my_raw_trnas.fa"
+```
+
 !!! info "Custom References"
-    To use a custom reference, ensure it includes both charged and uncharged tRNA variants with appropriate adapter sequences.
+    To use a custom reference, either:
+
+    1. Use `mode: "validate"` with a pre-adapted FASTA
+    2. Use `mode: "build"` with raw tRNA sequences (a missing CCA ending is added with a warning)
 
 ### Remora Models
 
diff --git a/workflow/Snakefile b/workflow/Snakefile
@@ -26,6 +26,7 @@ PIPELINE_DIR = os.path.dirname(SNAKEFILE_DIR)
 
 
 include: "rules/common.smk"
+include: "rules/aatrnaseq-reference.smk"
 include: "rules/aatrnaseq-process.smk"
 include: "rules/aatrnaseq-charging.smk"
 include: "rules/aatrnaseq-qc.smk"
diff --git a/workflow/rules/aatrnaseq-modifications.smk b/workflow/rules/aatrnaseq-modifications.smk
@@ -63,7 +63,7 @@ rule modkit_pileup:
     log:
         os.path.join(outdir, "logs", "modkit", "pileup", "{sample}"),
     params:
-        fa=config["fasta"],
+        fa=get_validated_reference(),
         threshold_opts=get_modkit_threshold_opts(),
     shell:
         """
@@ -90,7 +90,7 @@ rule modkit_extract_calls:
     log:
         os.path.join(outdir, "logs", "modkit", "extract_calls", "{sample}"),
     params:
-        fa=config["fasta"],
+        fa=get_validated_reference(),
         threshold_opts=get_modkit_threshold_opts(),
     shell:
         """
@@ -120,7 +120,7 @@ rule modkit_extract_full:
     log:
         os.path.join(outdir, "logs", "modkit", "extract_full", "{sample}"),
     params:
-        fa=config["fasta"],
+        fa=get_validated_reference(),
         threshold_opts=get_modkit_threshold_opts(),
     shell:
         """
diff --git a/workflow/rules/aatrnaseq-process.smk b/workflow/rules/aatrnaseq-process.smk
@@ -67,10 +67,14 @@ rule ubam_to_fastq:
 
 
 rule bwa_idx:
+    """
+    Build BWA index for the validated/built reference.
+    Depends on reference validation/building completing first.
+    """
     input:
-        config["fasta"],
+        get_validated_reference(),
     output:
-        multiext(config["fasta"], ".amb", ".ann", ".bwt", ".pac", ".sa"),
+        multiext(get_validated_reference(), ".amb", ".ann", ".bwt", ".pac", ".sa"),
     log:
         os.path.join(outdir, "logs", "bwa_idx", "log"),
     shell:
@@ -81,16 +85,17 @@ rule bwa_idx:
 
 rule bwa_align:
     """
-  align reads to tRNA references with bwa mem
-  """
+    Align reads to tRNA references with bwa mem.
+    Uses the validated/built reference.
+    """
     input:
         reads=rules.ubam_to_fastq.output,
         idx=rules.bwa_idx.output,
     output:
         bam=os.path.join(outdir, "bam", "aln", "{sample}", "{sample}.aln.bam"),
         bai=os.path.join(outdir, "bam", "aln", "{sample}", "{sample}.aln.bam.bai"),
     params:
-        index=config["fasta"],
+        index=get_validated_reference(),
         bwa_opts=config["opts"]["bwa"],
     log:
         os.path.join(outdir, "logs", "bwa_align", "{sample}"),
diff --git a/workflow/rules/aatrnaseq-qc.smk b/workflow/rules/aatrnaseq-qc.smk
@@ -17,7 +17,7 @@ rule base_calling_error:
         os.path.join(outdir, "logs", "bcerror", "{sample}.bwa"),
     params:
         src=SCRIPT_DIR,
-        fa=config["fasta"],
+        fa=get_validated_reference(),
     shell:
         """
     python {params.src}/get_bcerror_freqs.py \
diff --git a/workflow/rules/aatrnaseq-reference.smk b/workflow/rules/aatrnaseq-reference.smk
@@ -0,0 +1,114 @@
+"""
+Rules for tRNA reference validation and building.
+
+Ensures reference FASTA has correct adapter structure before alignment.
+The CCAGGC junction (CCA from tRNA + GGC from 3' adapter) is required
+for the Remora charging classification model.
+
+Modes:
+  validate: Check existing adapted reference (default)
+  build: Create adapted reference from raw tRNA sequences
+"""
+
+
+def get_adapter_5p():
+    """Get 5' adapter sequence from config with default fallback."""
+    return config.get("adapters", {}).get("five_prime", "CCTAAGAGCAAGAAGAAGCCTGG")
+
+
+def get_adapter_3p():
+    """Get 3' adapter sequence from config with default fallback."""
+    return config.get("adapters", {}).get(
+        "three_prime", "GGCTTCTTCTTGCTCTTCCAACCTTGCCTTAAAAAAAAAA"
+    )
+
+
+def get_reference_mode():
+    """Get reference processing mode (validate or build)."""
+    return config.get("reference", {}).get("mode", "validate")
+
+
+def get_validated_reference():
+    """
+    Return path to validated/built reference based on mode.
+    This is used by downstream rules (bwa_idx, bwa_align, etc.).
+    """
+    mode = get_reference_mode()
+    if mode == "build":
+        return os.path.join(outdir, "reference", "adapted.fa")
+    return os.path.join(outdir, "reference", "validated.fa")
+
+
+rule validate_reference:
+    """
+    Validate that an existing reference FASTA has correct adapter structure.
+
+    Checks:
+    - All sequences have correct 5' adapter prefix
+    - All tRNA portions end with CCA
+    - All sequences have correct 3' adapter suffix (starting with GGC)
+    - CCAGGC junction exists for charging classification
+    - No duplicate sequence names
+
+    Pipeline fails if validation fails.
+    """
+    # The existing adapted reference configured under the top-level `fasta` key.
+    input:
+        fasta=config["fasta"],
+    # `validated` is the path get_validated_reference() returns whenever the
+    # reference mode is not "build", so downstream rules depend on this rule.
+    output:
+        validated=os.path.join(outdir, "reference", "validated.fa"),
+        report=os.path.join(outdir, "reference", "validation_report.txt"),
+    log:
+        os.path.join(outdir, "logs", "reference", "validate.log"),
+    # The adapter getters are called here directly, i.e. when the rule is
+    # defined, and their results are passed to the script as CLI arguments.
+    params:
+        script=os.path.join(SCRIPT_DIR, "build_trna_reference.py"),
+        adapter_5p=get_adapter_5p(),
+        adapter_3p=get_adapter_3p(),
+    # stderr is merged into stdout and piped through `tee`, which writes the
+    # combined stream to the log file.
+    shell:
+        """
+        python {params.script} \
+            --mode validate \
+            --input {input.fasta} \
+            --output {output.validated} \
+            --report {output.report} \
+            --adapter-5p "{params.adapter_5p}" \
+            --adapter-3p "{params.adapter_3p}" \
+            2>&1 | tee {log}
+        """
+
+
+def _get_raw_fasta(wildcards):
+    """
+    Input function for build_reference: return the raw tRNA FASTA path.
+
+    Reads ``reference.raw_fasta`` from the workflow config. The shipped
+    default for that key is ``null``, so a descriptive error is raised here
+    rather than handing ``None`` (or a bare ``KeyError``) to Snakemake.
+
+    Raises:
+        ValueError: if ``reference.raw_fasta`` is missing or empty.
+    """
+    raw_fasta = (config.get("reference") or {}).get("raw_fasta")
+    if not raw_fasta:
+        raise ValueError(
+            "rule build_reference requires config['reference']['raw_fasta'] "
+            "to be set to the path of a raw tRNA FASTA (without adapters)"
+        )
+    return raw_fasta
+
+
+rule build_reference:
+    """
+    Build an adapted tRNA reference from raw tRNA sequences.
+
+    Input: Raw tRNA FASTA (sequences without adapters)
+    Output: Adapted reference FASTA with 5' and 3' adapters
+
+    Steps:
+    1. Check for CCA endings - add CCA if missing (with warning)
+    2. Prepend 5' adapter to each tRNA
+    3. Append 3' adapter after CCA
+    4. Verify CCAGGC junction is created
+    5. Write adapted FASTA
+    """
+    # Resolved lazily through an input function so that a missing
+    # `reference.raw_fasta` only errors when this rule is actually needed.
+    input:
+        raw_fasta=_get_raw_fasta,
+    # `adapted` is the path get_validated_reference() returns in "build" mode.
+    output:
+        adapted=os.path.join(outdir, "reference", "adapted.fa"),
+        report=os.path.join(outdir, "reference", "build_report.txt"),
+    log:
+        os.path.join(outdir, "logs", "reference", "build.log"),
+    params:
+        script=os.path.join(SCRIPT_DIR, "build_trna_reference.py"),
+        adapter_5p=get_adapter_5p(),
+        adapter_3p=get_adapter_3p(),
+    # stderr is merged into stdout and piped through `tee`, which writes the
+    # combined stream to the log file.
+    shell:
+        """
+        python {params.script} \
+            --mode build \
+            --input {input.raw_fasta} \
+            --output {output.adapted} \
+            --report {output.report} \
+            --adapter-5p "{params.adapter_5p}" \
+            --adapter-3p "{params.adapter_3p}" \
+            2>&1 | tee {log}
+        """
diff --git a/workflow/scripts/build_trna_reference.py b/workflow/scripts/build_trna_reference.py