Skip to content

Commit 534f0c6

Browse files
feat: add tRNA reference validation and building step (#78)
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 1009918 commit 534f0c6

File tree

9 files changed

+670
-42
lines changed

9 files changed

+670
-42
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ All notable changes to the aa-tRNA-seq pipeline are documented in this file.
55
## Unreleased
66

77
### Added
8+
- Reference validation and building step to ensure tRNA sequences have proper CCA endings and adapter structure required for charging classification
89
- Optional WarpDemuX barcode demultiplexing support for pooled/multiplexed sequencing runs (#74)
910
- Optimized modkit thresholds from ModkitOpt
1011
- Pixi package manager support as primary environment manager

config/config-base.yml

Lines changed: 56 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,30 @@ base_calling_model: "resources/models/rna004_130bps_sup@v5.1.0"
88
# a BWA index will be built if it does not exist for this fasta file
99
fasta: "resources/ref/sacCer3-mature-tRNAs-dual-adapt-v2.fa"
1010

11+
# Adapter sequences for tRNA reference building/validation
12+
# These must match what the Remora charging model was trained on.
13+
# The charging classification uses the CCAGGC 6-mer junction:
14+
# CCA = tRNA 3' end (last 3 bases of mature tRNA)
15+
# GGC = first 3 bases of 3' adapter
16+
# Therefore, the 3' adapter MUST start with GGC for classification to work.
17+
adapters:
18+
# 5' adapter prepended to tRNA (23bp, the first tRNA base is included as variable N)
19+
five_prime: "CCTAAGAGCAAGAAGAAGCCTGG"
20+
# 3' adapter appended after tRNA CCA end (40bp, starts with GGC)
21+
three_prime: "GGCTTCTTCTTGCTCTTCCAACCTTGCCTTAAAAAAAAAA"
22+
23+
# Reference validation/building mode
24+
# The pipeline validates that the reference FASTA has proper adapter structure
25+
# before alignment. This ensures the CCAGGC junction exists for charging classification.
26+
#
27+
# Modes:
28+
# validate: Check existing adapted reference (default)
29+
# build: Create adapted reference from raw tRNA sequences
30+
reference:
31+
mode: "validate"
32+
# For build mode only: path to raw tRNA FASTA (without adapters, must end in CCA)
33+
raw_fasta: null
34+
1135
# If a kmer table if provided then the pipeline will use get_signal_metrics.py to extract metrics using remora
1236
# from: https://github.com/nanoporetech/kmer_models/tree/master/rna004
1337
remora_kmer_table: "resources/kmers/9mer_levels_v1.txt"
@@ -24,41 +48,41 @@ dorado_model: rna004_130bps_sup@v5.1.0
2448
# see https://github.com/comprna/modkitopt
2549
# these params improve F1 by 51% (m6A) and 1251% (pseU) compared to defaults
2650
modkit:
27-
# global threshold for canonical base confidence
28-
filter_threshold: 0.5
29-
# per-modification pass thresholds (mod code or ChEBI ID : threshold)
30-
# a = N6-methyladenosine (m6A)
31-
# m = 5-methylcytosine (m5C)
32-
# 17802 = pseudouridine (pseU)
33-
# 17596 = inosine
34-
mod_thresholds:
35-
a: 0.99
36-
m: 0.99
37-
"17802": 0.995
38-
"17596": 0.99
51+
# global threshold for canonical base confidence
52+
filter_threshold: 0.5
53+
# per-modification pass thresholds (mod code or ChEBI ID : threshold)
54+
# a = N6-methyladenosine (m6A)
55+
# m = 5-methylcytosine (m5C)
56+
# 17802 = pseudouridine (pseU)
57+
# 17596 = inosine
58+
mod_thresholds:
59+
a: 0.99
60+
m: 0.99
61+
"17802": 0.995
62+
"17596": 0.99
3963

4064
# additional options for particular commands
4165
opts:
42-
# additional options for dorado basecalling
43-
# XXX place modified bases first as the arg parser gets confused
44-
# XXX add `-v` for verbose logging
45-
dorado: " --modified-bases pseU m5C inosine_m6A --emit-moves "
66+
# additional options for dorado basecalling
67+
# XXX place modified bases first as the arg parser gets confused
68+
# XXX add `-v` for verbose logging
69+
dorado: " --modified-bases pseU m5C inosine_m6A --emit-moves "
4670

47-
# additional options for bwa alignment
48-
# based on Novoa lab optimising bwa for tRNA alignment
49-
# the -h 20 option is used to increase the number of secondary alignments reported in the XA tag
50-
bwa: " -W 13 -k 6 -T 20 -x ont2d"
71+
# additional options for bwa alignment
72+
# based on Novoa lab optimising bwa for tRNA alignment
73+
# the -h 20 option is used to increase the number of secondary alignments reported in the XA tag
74+
bwa: " -W 13 -k 6 -T 20 -x ont2d"
5175

52-
# requires positive strand alignment
53-
# requires at least 1 5' adapter base
54-
# requires 1 3' adapter base in the discriminating adapter region between charged and uncharged (v2 adapters).
55-
bam_filter: "-5 24 -3 23 -s"
76+
# requires positive strand alignment
77+
# requires at least 1 5' adapter base
78+
# requires 1 3' adapter base in the discriminating adapter region between charged and uncharged (v2 adapters).
79+
bam_filter: "-5 24 -3 23 -s"
5680

57-
#requires positive strand alignment and excludes non-primary alignments
58-
coverage: "--filterRNAstrand 'reverse' --samFlagExclude 256"
81+
# requires positive strand alignment and excludes non-primary alignments
82+
coverage: "--filterRNAstrand 'reverse' --samFlagExclude 256"
5983

60-
# pass additional options to get_signal_metrics.py script which uses Remora to calculate metrics
61-
remora: ""
84+
# pass additional options to get_signal_metrics.py script which uses Remora to calculate metrics
85+
remora: ""
6286

6387
# WarpDemuX demultiplexing (optional, disabled by default)
6488
#
@@ -71,7 +95,7 @@ opts:
7195
#
7296
# WDX4_tRNA_rna004_v1_0 has improved recovery (+3-7%) compared to WDX4b_tRNA_rna004_v1_0.
7397
warpdemux:
74-
enabled: false
75-
barcode_kit: "WDX4_tRNA_rna004_v1_0"
76-
save_boundaries: true
77-
threads: 8
98+
enabled: false
99+
barcode_kit: "WDX4_tRNA_rna004_v1_0"
100+
save_boundaries: true
101+
threads: 8

docs/user-guide/configuration.md

Lines changed: 67 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,8 +53,74 @@ fasta: "resources/ref/sacCer3-mature-tRNAs-dual-adapt-v2.fa"
5353
5454
A BWA index is built automatically if it doesn't exist.
5555
56+
### Adapter Sequences
57+
58+
The pipeline uses adapter sequences for reference validation and building. These must match what the Remora charging model was trained on:
59+
60+
```yaml
61+
adapters:
62+
# 5' adapter prepended to tRNA (23bp)
63+
five_prime: "CCTAAGAGCAAGAAGAAGCCTGG"
64+
# 3' adapter appended after tRNA CCA end (40bp)
65+
three_prime: "GGCTTCTTCTTGCTCTTCCAACCTTGCCTTAAAAAAAAAA"
66+
```
67+
68+
!!! important "CCAGGC Junction"
69+
The charging classification uses the **CCAGGC** 6-mer junction where:
70+
71+
- **CCA** = last 3 bases of mature tRNA
72+
- **GGC** = first 3 bases of 3' adapter
73+
74+
The 3' adapter **must** start with GGC for classification to work correctly.
75+
76+
### Reference Validation and Building
77+
78+
The pipeline validates that the reference FASTA has proper adapter structure before alignment:
79+
80+
```yaml
81+
reference:
82+
# Mode: "validate" (default) or "build"
83+
mode: "validate"
84+
# For build mode: path to raw tRNA FASTA (without adapters)
85+
raw_fasta: null
86+
```
87+
88+
| Mode | Description |
89+
|------|-------------|
90+
| `validate` | Check existing adapted reference has correct structure |
91+
| `build` | Create adapted reference from raw tRNA sequences |
92+
93+
#### Validate Mode (Default)
94+
95+
Checks that each sequence in your reference has:
96+
97+
- Correct 5' adapter prefix
98+
- tRNA portion ending with CCA
99+
- Correct 3' adapter suffix (starting with GGC)
100+
- Valid CCAGGC junction for charging classification
101+
102+
#### Build Mode
103+
104+
Creates an adapted reference from raw tRNA sequences:
105+
106+
1. Reads raw tRNA FASTA (without adapters)
107+
2. Adds CCA to sequences missing it (with warning)
108+
3. Prepends 5' adapter
109+
4. Appends 3' adapter after CCA
110+
5. Verifies CCAGGC junction is created
111+
112+
```yaml
113+
# Example: building reference from raw tRNAs
114+
reference:
115+
mode: "build"
116+
raw_fasta: "resources/ref/my_raw_trnas.fa"
117+
```
118+
56119
!!! info "Custom References"
57-
To use a custom reference, ensure it includes both charged and uncharged tRNA variants with appropriate adapter sequences.
120+
To use a custom reference, either:
121+
122+
1. Use `mode: "validate"` with a pre-adapted FASTA
123+
2. Use `mode: "build"` with raw tRNA sequences (CCA endings required or will be added)
58124

59125
### Remora Models
60126

workflow/Snakefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ PIPELINE_DIR = os.path.dirname(SNAKEFILE_DIR)
2626

2727

2828
include: "rules/common.smk"
29+
include: "rules/aatrnaseq-reference.smk"
2930
include: "rules/aatrnaseq-process.smk"
3031
include: "rules/aatrnaseq-charging.smk"
3132
include: "rules/aatrnaseq-qc.smk"

workflow/rules/aatrnaseq-modifications.smk

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ rule modkit_pileup:
6363
log:
6464
os.path.join(outdir, "logs", "modkit", "pileup", "{sample}"),
6565
params:
66-
fa=config["fasta"],
66+
fa=get_validated_reference(),
6767
threshold_opts=get_modkit_threshold_opts(),
6868
shell:
6969
"""
@@ -90,7 +90,7 @@ rule modkit_extract_calls:
9090
log:
9191
os.path.join(outdir, "logs", "modkit", "extract_calls", "{sample}"),
9292
params:
93-
fa=config["fasta"],
93+
fa=get_validated_reference(),
9494
threshold_opts=get_modkit_threshold_opts(),
9595
shell:
9696
"""
@@ -120,7 +120,7 @@ rule modkit_extract_full:
120120
log:
121121
os.path.join(outdir, "logs", "modkit", "extract_full", "{sample}"),
122122
params:
123-
fa=config["fasta"],
123+
fa=get_validated_reference(),
124124
threshold_opts=get_modkit_threshold_opts(),
125125
shell:
126126
"""

workflow/rules/aatrnaseq-process.smk

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -67,10 +67,14 @@ rule ubam_to_fastq:
6767

6868

6969
rule bwa_idx:
70+
"""
71+
Build BWA index for the validated/built reference.
72+
Depends on reference validation/building completing first.
73+
"""
7074
input:
71-
config["fasta"],
75+
get_validated_reference(),
7276
output:
73-
multiext(config["fasta"], ".amb", ".ann", ".bwt", ".pac", ".sa"),
77+
multiext(get_validated_reference(), ".amb", ".ann", ".bwt", ".pac", ".sa"),
7478
log:
7579
os.path.join(outdir, "logs", "bwa_idx", "log"),
7680
shell:
@@ -81,16 +85,17 @@ rule bwa_idx:
8185

8286
rule bwa_align:
8387
"""
84-
align reads to tRNA references with bwa mem
85-
"""
88+
Align reads to tRNA references with bwa mem.
89+
Uses the validated/built reference.
90+
"""
8691
input:
8792
reads=rules.ubam_to_fastq.output,
8893
idx=rules.bwa_idx.output,
8994
output:
9095
bam=os.path.join(outdir, "bam", "aln", "{sample}", "{sample}.aln.bam"),
9196
bai=os.path.join(outdir, "bam", "aln", "{sample}", "{sample}.aln.bam.bai"),
9297
params:
93-
index=config["fasta"],
98+
index=get_validated_reference(),
9499
bwa_opts=config["opts"]["bwa"],
95100
log:
96101
os.path.join(outdir, "logs", "bwa_align", "{sample}"),

workflow/rules/aatrnaseq-qc.smk

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ rule base_calling_error:
1717
os.path.join(outdir, "logs", "bcerror", "{sample}.bwa"),
1818
params:
1919
src=SCRIPT_DIR,
20-
fa=config["fasta"],
20+
fa=get_validated_reference(),
2121
shell:
2222
"""
2323
python {params.src}/get_bcerror_freqs.py \
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
1+
"""
2+
Rules for tRNA reference validation and building.
3+
4+
Ensures reference FASTA has correct adapter structure before alignment.
5+
The CCAGGC junction (CCA from tRNA + GGC from 3' adapter) is required
6+
for the Remora charging classification model.
7+
8+
Modes:
9+
validate: Check existing adapted reference (default)
10+
build: Create adapted reference from raw tRNA sequences
11+
"""
12+
13+
14+
def get_adapter_5p():
15+
"""Get 5' adapter sequence from config with default fallback."""
16+
return config.get("adapters", {}).get("five_prime", "CCTAAGAGCAAGAAGAAGCCTGG")
17+
18+
19+
def get_adapter_3p():
20+
"""Get 3' adapter sequence from config with default fallback."""
21+
return config.get("adapters", {}).get(
22+
"three_prime", "GGCTTCTTCTTGCTCTTCCAACCTTGCCTTAAAAAAAAAA"
23+
)
24+
25+
26+
def get_reference_mode():
27+
"""Get reference processing mode (validate or build)."""
28+
return config.get("reference", {}).get("mode", "validate")
29+
30+
31+
def get_validated_reference():
32+
"""
33+
Return path to validated/built reference based on mode.
34+
This is used by downstream rules (bwa_idx, bwa_align, etc.).
35+
"""
36+
mode = get_reference_mode()
37+
if mode == "build":
38+
return os.path.join(outdir, "reference", "adapted.fa")
39+
return os.path.join(outdir, "reference", "validated.fa")
40+
41+
42+
rule validate_reference:
43+
"""
44+
Validate that an existing reference FASTA has correct adapter structure.
45+
46+
Checks:
47+
- All sequences have correct 5' adapter prefix
48+
- All tRNA portions end with CCA
49+
- All sequences have correct 3' adapter suffix (starting with GGC)
50+
- CCAGGC junction exists for charging classification
51+
- No duplicate sequence names
52+
53+
Pipeline fails if validation fails.
54+
"""
55+
input:
56+
fasta=config["fasta"],
57+
output:
58+
validated=os.path.join(outdir, "reference", "validated.fa"),
59+
report=os.path.join(outdir, "reference", "validation_report.txt"),
60+
log:
61+
os.path.join(outdir, "logs", "reference", "validate.log"),
62+
params:
63+
script=os.path.join(SCRIPT_DIR, "build_trna_reference.py"),
64+
adapter_5p=get_adapter_5p(),
65+
adapter_3p=get_adapter_3p(),
66+
shell:
67+
"""
68+
python {params.script} \
69+
--mode validate \
70+
--input {input.fasta} \
71+
--output {output.validated} \
72+
--report {output.report} \
73+
--adapter-5p "{params.adapter_5p}" \
74+
--adapter-3p "{params.adapter_3p}" \
75+
2>&1 | tee {log}
76+
"""
77+
78+
79+
rule build_reference:
80+
"""
81+
Build an adapted tRNA reference from raw tRNA sequences.
82+
83+
Input: Raw tRNA FASTA (sequences without adapters)
84+
Output: Adapted reference FASTA with 5' and 3' adapters
85+
86+
Steps:
87+
1. Check for CCA endings - add CCA if missing (with warning)
88+
2. Prepend 5' adapter to each tRNA
89+
3. Append 3' adapter after CCA
90+
4. Verify CCAGGC junction is created
91+
5. Write adapted FASTA
92+
"""
93+
input:
94+
raw_fasta=lambda wildcards: config["reference"]["raw_fasta"],
95+
output:
96+
adapted=os.path.join(outdir, "reference", "adapted.fa"),
97+
report=os.path.join(outdir, "reference", "build_report.txt"),
98+
log:
99+
os.path.join(outdir, "logs", "reference", "build.log"),
100+
params:
101+
script=os.path.join(SCRIPT_DIR, "build_trna_reference.py"),
102+
adapter_5p=get_adapter_5p(),
103+
adapter_3p=get_adapter_3p(),
104+
shell:
105+
"""
106+
python {params.script} \
107+
--mode build \
108+
--input {input.raw_fasta} \
109+
--output {output.adapted} \
110+
--report {output.report} \
111+
--adapter-5p "{params.adapter_5p}" \
112+
--adapter-3p "{params.adapter_3p}" \
113+
2>&1 | tee {log}
114+
"""

0 commit comments

Comments
 (0)