Skip to content

Commit 7cfa3b8

Browse files
fix: rename custom BAM tags to lowercase for SAM spec conformance
CL→cl, CM→cm, PT→pt (uppercase two-letter tags are reserved for standard/predefined tags). Also unwrap single-element ML arrays to scalar integers (ML:B:C:200 → cl:i:200) and add backward compat fallback in get_charging_table.py for older BAMs with uppercase tags.
1 parent a9a6a47 commit 7cfa3b8

File tree

6 files changed

+36
-23
lines changed

6 files changed

+36
-23
lines changed

CLAUDE.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -113,8 +113,8 @@ rebasecall → detect_edx_adapters → extract_edx_read_ids
113113
3. **ubam_to_fastq**: Extract reads from unmapped BAM to FASTQ
114114
4. **bwa_align**: Align reads to tRNA + adapter reference with BWA MEM
115115
5. **classify_charging**: Use Remora model to classify charged vs uncharged reads (adds ML tag to BAM)
116-
6. **transfer_bam_tags**: Transfer alignment tags back to classified BAM (ML→CL, MM→CM)
117-
7. **add_adapter_tags**: Detect adapter positions and add PT tags with 5'/3' boundaries
116+
6. **transfer_bam_tags**: Transfer alignment tags back to classified BAM (ML→cl, MM→cm)
117+
7. **add_adapter_tags**: Detect adapter positions and add pt tags with 5'/3' boundaries
118118
8. **finalize_bam**: Symlink adapter-tagged BAM as final output (EDX filtering now happens before alignment)
119119

120120
### Summary Generation
@@ -295,7 +295,7 @@ Outputs go to directory specified by `output_dir` in config. Test outputs: `.tes
295295
Key outputs per sample:
296296
- `summary/tables/{sample}/{sample}.charging.cpm.tsv.gz` - CPM-normalized charging counts
297297
- `summary/tables/{sample}/{sample}.charging_prob.tsv.gz` - Per-read charging probabilities
298-
- `bam/final/{sample}/{sample}.bam` - Final BAM with CL/CM (charging) and PT (adapter positions) tags
298+
- `bam/final/{sample}/{sample}.bam` - Final BAM with cl/cm (charging) and pt (adapter positions) tags
299299

300300
Pipeline-level outputs:
301301
- `squiggy-session.json` - Squiggy session file for loading samples in Positron

workflow/rules/aatrnaseq-charging.smk

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ rule get_cca_trna:
1919
shell:
2020
"""
2121
python {params.src}/get_charging_table.py \
22-
--tag CL \
22+
--tag cl \
2323
{input.bam} \
2424
{output.charging_tab}
2525
"""

workflow/rules/aatrnaseq-process.smk

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -271,9 +271,9 @@ rule classify_charging_leech:
271271

272272
rule transfer_bam_tags:
273273
"""
274-
creates classified bam with MM and ML tags transferred to CM/CL
274+
creates classified bam with MM and ML tags transferred to cm/cl
275275
276-
MM/ML tags from the charging classification are transferred to CM/CL so as not to interfere with
276+
MM/ML tags from the charging classification are transferred to cm/cl so as not to interfere with
277277
base modifications.
278278
"""
279279
input:
@@ -294,7 +294,7 @@ rule transfer_bam_tags:
294294
"""
295295
python {params.src}/transfer_tags.py \
296296
--tags ML MM \
297-
--rename ML=CL MM=CM \
297+
--rename ML=cl MM=cm \
298298
--source {input.source_bam} \
299299
--target {input.target_bam} \
300300
--output {output.classified_bam}
@@ -306,12 +306,12 @@ rule transfer_bam_tags:
306306
rule add_adapter_tags:
307307
"""
308308
Detect adapter positions in reads using parasail alignment
309-
and add PT tags (SAM-spec read annotation format) to BAM file.
309+
and add pt tags (SAM-spec read annotation format) to BAM file.
310310
311-
PT tag format: start;end;strand;type|start;end;strand;type
312-
Example: PT:Z:0;24;+;5p_adapter|118;135;+;3p_adapter
311+
pt tag format: start;end;strand;type|start;end;strand;type
312+
Example: pt:Z:0;24;+;5p_adapter|118;135;+;3p_adapter
313313
314-
This produces the final BAM with all tags: CM/CL (charging) and PT (adapters).
314+
This produces the final BAM with all tags: cm/cl (charging) and pt (adapters).
315315
"""
316316
input:
317317
bam=rules.transfer_bam_tags.output.classified_bam,

workflow/scripts/add_adapter_tags.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,11 @@
22
"""
33
Add adapter position tags to BAM file using parasail semi-global alignment.
44
5-
Uses SAM-spec PT:Z: tag format for read annotations:
6-
PT:Z:start;end;strand;type|start;end;strand;type
5+
Uses SAM-spec pt:Z: tag format for read annotations:
6+
pt:Z:start;end;strand;type|start;end;strand;type
77
88
Example:
9-
PT:Z:0;24;+;5p_adapter|118;135;+;3p_adapter
9+
pt:Z:0;24;+;5p_adapter|118;135;+;3p_adapter
1010
1111
Positions are 0-based, relative to the read sequence.
1212
If an adapter is not found, that annotation is omitted.
@@ -204,9 +204,9 @@ def find_best_3p_adapter(read_seq, adapters, matrix, gap_open, gap_extend, min_s
204204

205205
def format_pt_tag(adapter_5p_result, adapter_3p_result):
206206
"""
207-
Format adapter positions as SAM-spec PT tag.
207+
Format adapter positions as SAM-spec pt tag.
208208
209-
Format: PT:Z:start;end;strand;type|start;end;strand;type
209+
Format: pt:Z:start;end;strand;type|start;end;strand;type
210210
211211
For 3' adapters with names, the type will be "3p_adapter_<name>"
212212
(e.g., "3p_adapter_v1" or "3p_adapter_v2").
@@ -333,7 +333,7 @@ def process_bam(
333333
# Add PT tag if any adapter found
334334
pt_value = format_pt_tag(result_5p, result_3p)
335335
if pt_value:
336-
read.set_tag("PT", pt_value, "Z")
336+
read.set_tag("pt", pt_value, "Z")
337337

338338
outbam.write(read)
339339

workflow/scripts/get_charging_table.py

Lines changed: 15 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,15 +24,24 @@ def extract_tag(bam_file, output_tsv, tag):
2424
for read in bam.fetch():
2525
read_id = read.query_name
2626
reference = read.reference_name if read.reference_name else "*"
27-
tag_array = dict(read.tags).get(tag, None)
27+
tags_dict = dict(read.tags)
28+
tag_raw = tags_dict.get(tag, None)
2829

29-
# XXX: handle case where there are more than 1 tag value
30-
# not clear why this is, but we skip for now as it's a small
31-
# number of reads affected
32-
if len(tag_array) > 1:
30+
# Fallback to uppercase tag for backward compat with older BAMs
31+
# TODO: remove fallback once all BAMs have been reprocessed
32+
if tag_raw is None and tag.islower():
33+
tag_raw = tags_dict.get(tag.upper(), None)
34+
35+
if tag_raw is None:
3336
continue
3437

35-
tag_value = tag_array[0]
38+
# Handle both scalar (cl:i:200) and array (CL:B:C:200) tag values
39+
if hasattr(tag_raw, "__len__") and not isinstance(tag_raw, str):
40+
if len(tag_raw) > 1:
41+
continue
42+
tag_value = tag_raw[0]
43+
else:
44+
tag_value = tag_raw
3645

3746
if tag_value and reference != "*":
3847
writer.writerow([read_id, reference, tag_value])

workflow/scripts/transfer_tags.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,10 @@ def transfer_tags(
6363

6464
if read_tags:
6565
for tag, tag_val in read_tags.items():
66+
# Unwrap single-element arrays to scalar values
67+
# (e.g., ML:B:C:200 → cl:i:200)
68+
if hasattr(tag_val, "__len__") and not isinstance(tag_val, str) and len(tag_val) == 1:
69+
tag_val = tag_val[0]
6670
if tag in renamed_tags:
6771
read.set_tag(renamed_tags[tag], tag_val)
6872
else:

0 commit comments

Comments
 (0)