fix: rename custom BAM tags to lowercase for SAM spec conformance

jayhesselberth · jayhesselberth · commit 7cfa3b840597 · 2026-03-08T14:19:14.000-06:00
CL→cl, CM→cm, PT→pt: the SAM spec reserves tags containing lowercase
letters for local use, while uppercase two-letter tags may be claimed by
standard/predefined tags. Also unwrap single-element ML arrays to scalar
integers (ML:B:C,200 → cl:i:200) and add a backward-compatibility fallback
in get_charging_table.py for older BAMs with uppercase tags.
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -113,8 +113,8 @@ rebasecall → detect_edx_adapters → extract_edx_read_ids
 3. **ubam_to_fastq**: Extract reads from unmapped BAM to FASTQ
 4. **bwa_align**: Align reads to tRNA + adapter reference with BWA MEM
 5. **classify_charging**: Use Remora model to classify charged vs uncharged reads (adds ML tag to BAM)
-6. **transfer_bam_tags**: Transfer alignment tags back to classified BAM (ML→CL, MM→CM)
-7. **add_adapter_tags**: Detect adapter positions and add PT tags with 5'/3' boundaries
+6. **transfer_bam_tags**: Transfer alignment tags back to classified BAM (ML→cl, MM→cm)
+7. **add_adapter_tags**: Detect adapter positions and add pt tags with 5'/3' boundaries
 8. **finalize_bam**: Symlink adapter-tagged BAM as final output (EDX filtering now happens before alignment)
 
 ### Summary Generation
@@ -295,7 +295,7 @@ Outputs go to directory specified by `output_dir` in config. Test outputs: `.tes
 Key outputs per sample:
 - `summary/tables/{sample}/{sample}.charging.cpm.tsv.gz` - CPM-normalized charging counts
 - `summary/tables/{sample}/{sample}.charging_prob.tsv.gz` - Per-read charging probabilities
-- `bam/final/{sample}/{sample}.bam` - Final BAM with CL/CM (charging) and PT (adapter positions) tags
+- `bam/final/{sample}/{sample}.bam` - Final BAM with cl/cm (charging) and pt (adapter positions) tags
 
 Pipeline-level outputs:
 - `squiggy-session.json` - Squiggy session file for loading samples in Positron
diff --git a/workflow/rules/aatrnaseq-charging.smk b/workflow/rules/aatrnaseq-charging.smk
@@ -19,7 +19,7 @@ rule get_cca_trna:
     shell:
         """
     python {params.src}/get_charging_table.py \
-      --tag CL \
+      --tag cl \
       {input.bam} \
       {output.charging_tab}
     """
diff --git a/workflow/rules/aatrnaseq-process.smk b/workflow/rules/aatrnaseq-process.smk
@@ -271,9 +271,9 @@ rule classify_charging_leech:
 
 rule transfer_bam_tags:
     """
-  creates classified bam with MM and ML tags transferred to CM/CL
+  creates classified bam with MM and ML tags transferred to cm/cl
 
-  MM/ML tags from the charging classification are transferred to CM/CL so as not to interfere with
+  MM/ML tags from the charging classification are transferred to cm/cl so as not to interfere with
   base modifications.
   """
     input:
@@ -294,7 +294,7 @@ rule transfer_bam_tags:
         """
     python {params.src}/transfer_tags.py \
       --tags ML MM \
-      --rename ML=CL MM=CM \
+      --rename ML=cl MM=cm \
       --source {input.source_bam} \
       --target {input.target_bam} \
       --output {output.classified_bam}
@@ -306,12 +306,12 @@ rule transfer_bam_tags:
 rule add_adapter_tags:
     """
     Detect adapter positions in reads using parasail alignment
-    and add PT tags (SAM-spec read annotation format) to BAM file.
+    and add pt tags (SAM-spec read annotation format) to BAM file.
 
-    PT tag format: start;end;strand;type|start;end;strand;type
-    Example: PT:Z:0;24;+;5p_adapter|118;135;+;3p_adapter
+    pt tag format: start;end;strand;type|start;end;strand;type
+    Example: pt:Z:0;24;+;5p_adapter|118;135;+;3p_adapter
 
-    This produces the final BAM with all tags: CM/CL (charging) and PT (adapters).
+    This produces the final BAM with all tags: cm/cl (charging) and pt (adapters).
     """
     input:
         bam=rules.transfer_bam_tags.output.classified_bam,
diff --git a/workflow/scripts/add_adapter_tags.py b/workflow/scripts/add_adapter_tags.py
@@ -2,11 +2,11 @@
 """
 Add adapter position tags to BAM file using parasail semi-global alignment.
 
-Uses SAM-spec PT:Z: tag format for read annotations:
-    PT:Z:start;end;strand;type|start;end;strand;type
+Uses SAM-spec pt:Z: tag format for read annotations:
+    pt:Z:start;end;strand;type|start;end;strand;type
 
 Example:
-    PT:Z:0;24;+;5p_adapter|118;135;+;3p_adapter
+    pt:Z:0;24;+;5p_adapter|118;135;+;3p_adapter
 
 Positions are 0-based, relative to the read sequence.
 If an adapter is not found, that annotation is omitted.
@@ -204,9 +204,9 @@ def find_best_3p_adapter(read_seq, adapters, matrix, gap_open, gap_extend, min_s
 
 def format_pt_tag(adapter_5p_result, adapter_3p_result):
     """
-    Format adapter positions as SAM-spec PT tag.
+    Format adapter positions as SAM-spec pt tag.
 
-    Format: PT:Z:start;end;strand;type|start;end;strand;type
+    Format: pt:Z:start;end;strand;type|start;end;strand;type
 
     For 3' adapters with names, the type will be "3p_adapter_<name>"
     (e.g., "3p_adapter_v1" or "3p_adapter_v2").
@@ -333,7 +333,7 @@ def process_bam(
                 # Add PT tag if any adapter found
                 pt_value = format_pt_tag(result_5p, result_3p)
                 if pt_value:
-                    read.set_tag("PT", pt_value, "Z")
+                    read.set_tag("pt", pt_value, "Z")
 
                 outbam.write(read)
 
diff --git a/workflow/scripts/get_charging_table.py b/workflow/scripts/get_charging_table.py
@@ -24,15 +24,24 @@ def extract_tag(bam_file, output_tsv, tag):
         for read in bam.fetch():
             read_id = read.query_name
             reference = read.reference_name if read.reference_name else "*"
-            tag_array = dict(read.tags).get(tag, None)
+            tags_dict = dict(read.tags)
+            tag_raw = tags_dict.get(tag, None)
 
-            # XXX: handle case where there are more than 1 tag value
-            # not clear why this is, but we skip for now as it's a small
-            # number of reads affected
-            if len(tag_array) > 1:
+            # Fallback to uppercase tag for backward compat with older BAMs
+            # TODO: remove fallback once all BAMs have been reprocessed
+            if tag_raw is None and tag.islower():
+                tag_raw = tags_dict.get(tag.upper(), None)
+
+            if tag_raw is None:
                 continue
 
-            tag_value = tag_array[0]
+            # Handle both scalar (cl:i:200) and array (CL:B:C,200) tag values
+            if hasattr(tag_raw, "__len__") and not isinstance(tag_raw, str):
+                if len(tag_raw) > 1:
+                    continue
+                tag_value = tag_raw[0]
+            else:
+                tag_value = tag_raw
 
             if tag_value and reference != "*":
                 writer.writerow([read_id, reference, tag_value])
diff --git a/workflow/scripts/transfer_tags.py b/workflow/scripts/transfer_tags.py
@@ -63,6 +63,10 @@ def transfer_tags(
 
             if read_tags:
                 for tag, tag_val in read_tags.items():
+                    # Unwrap single-element arrays to scalar values
+                    # (e.g., ML:B:C,200 → cl:i:200)
+                    if hasattr(tag_val, "__len__") and not isinstance(tag_val, str) and len(tag_val) == 1:
+                        tag_val = tag_val[0]
                     if tag in renamed_tags:
                         read.set_tag(renamed_tags[tag], tag_val)
                     else: