feat: don't rely on user supplied target metadata for transcript selection

bencap · bencap · commit bc59bbcae7f5 · 2025-12-10T11:26:30.000-08:00
Prior to this change, we relied on the user supplying an appropriate HGNC symbol as the name of their target. This is no longer required. Instead, transcript selection now uses the following algorithm:

1. Align the target sequence with BLAT.
2. Fetch transcripts which overlap the aligned region (notably, without an HGNC symbol filter).
3. Perform transcript selection within each distinct gene. This leaves us with either (a) one transcript, when the aligned region falls within a single gene (no overlapping genes), or (b) one transcript per gene, when multiple genes overlap the aligned region. These are our candidate transcripts.
4. If we still have more than one candidate transcript, compare the similarity of each candidate to the provided target sequence and select the most similar transcript.
diff --git a/src/dcd_mapping/lookup.py b/src/dcd_mapping/lookup.py
@@ -264,35 +264,30 @@ async def get_protein_accession(transcript: str) -> str | None:
 
 
 async def get_transcripts(
-    gene_symbol: str, chromosome_ac: str, start: int, end: int
-) -> list[str]:
-    """Get transcript accessions matching given parameters (excluding non-coding RNA).
+    chromosome_ac: str, start: int, end: int
+) -> list[tuple[str, str]]:
+    """Get transcript accessions matching given parameters (excluding non-coding RNA),
+    returning both the transcript accession and HGNC symbol.
 
-    TODO: may be able to successfully query with only one of gene symbol/chromosome ac.
-    In initial testing, gene symbol doesn't seem to be a meaningful filter, but should
-    get further confirmation.
-
-    :param gene_symbol: HGNC-given gene symbol (usually, but not always, equivalent to
-        symbols available in other nomenclatures.)
     :param chromosome: chromosome accession (e.g. ``"NC_000007.13"``)
     :param start: starting position
     :param end: ending position
-    :return: candidate transcript accessions
+    :return: candidate transcript accessions and HGNC symbols
     """
     try:
         uta = CoolSeqToolBuilder().uta_db
         query = f"""
-        SELECT tx_ac
+        SELECT tx_ac, hgnc
         FROM {uta.schema}.tx_exon_aln_v
-        WHERE hgnc = '{gene_symbol}'
-        AND ({start} BETWEEN alt_start_i AND alt_end_i OR {end} BETWEEN alt_start_i AND alt_end_i)
+        WHERE ({start} BETWEEN alt_start_i AND alt_end_i OR {end} BETWEEN alt_start_i AND alt_end_i)
         AND alt_ac = '{chromosome_ac}'
         AND tx_ac NOT LIKE 'NR_%';
         """  # noqa: S608
         result = await uta.execute_query(query)
     except Exception as e:
         raise DataLookupError from e
-    return [row["tx_ac"] for row in result]
+
+    return [(row["tx_ac"], row["hgnc"]) for row in result]
 
 
 # ------------------------------ Gene Normalizer ------------------------------ #
@@ -596,7 +591,7 @@ def translate_hgvs_to_vrs(hgvs: str) -> Allele:
 # ----------------------------------- MANE ----------------------------------- #
 
 
-def get_mane_transcripts(transcripts: set[str]) -> list[ManeDescription]:
+def get_mane_transcripts(transcripts: list[str]) -> list[ManeDescription]:
     """Get corresponding MANE data for transcripts. Results given in order of
     transcript preference.
 
diff --git a/src/dcd_mapping/transcripts.py b/src/dcd_mapping/transcripts.py
@@ -1,6 +1,7 @@
 """Select best reference sequence."""
 import logging
 import re
+from difflib import SequenceMatcher
 
 from Bio.Data.CodonTable import IUPACData
 from Bio.Seq import Seq
@@ -10,7 +11,6 @@
 from dcd_mapping.exceptions import TxSelectError
 from dcd_mapping.lookup import (
     get_chromosome_identifier,
-    get_gene_symbol,
     get_mane_transcripts,
     get_protein_accession,
     get_seqrepo,
@@ -36,35 +36,80 @@
 
 
 async def _get_compatible_transcripts(
-    target_gene: TargetGene, align_result: AlignmentResult
-) -> set[str]:
-    """Acquire transcripts which overlap with all hit subranges
+    align_result: AlignmentResult,
+) -> set[tuple[str, str]]:
+    """Acquire transcripts and their HGNC symbols which overlap with all hit subranges
     of an alignment result.
 
     :param metadata: metadata for scoreset
     :param align_result: output of ``align()`` method
     :return: Set of compatible transcripts
     """
-    if align_result.chrom.startswith("chr"):
-        aligned_chrom = align_result.chrom[3:]
-    else:
-        aligned_chrom = align_result.chrom
+    aligned_chrom = (
+        align_result.chrom[3:]
+        if align_result.chrom.startswith("chr")
+        else align_result.chrom
+    )
     chromosome = get_chromosome_identifier(aligned_chrom)
-    gene_symbol = get_gene_symbol(target_gene)
-    if not gene_symbol:
-        msg = (
-            f"Unable to find gene symbol for target gene {target_gene.target_gene_name}"
-        )
-        raise TxSelectError(msg)
-    transcript_matches: set[str] = set()
+
+    transcript_matches: set[tuple[str, str]] = set()
     for hit_range in align_result.hit_subranges:
-        matches_list = await get_transcripts(
-            gene_symbol, chromosome, hit_range.start, hit_range.end
-        )
+        matches_list = await get_transcripts(chromosome, hit_range.start, hit_range.end)
+        if not transcript_matches:
+            transcript_matches = set(matches_list)
+
         transcript_matches.intersection_update(matches_list)
+
     return transcript_matches
 
 
+def _percent_similarity(a: str, b: str) -> float:
+    """Compute a simple normalized similarity between two sequences.
+
+    Uses substring check (perfect local match) as a fast path; otherwise falls
+    back to difflib's `SequenceMatcher` ratio which is robust for short strings
+    and small edits.
+
+    :param a: query sequence (typically the provided target protein sequence)
+    :param b: reference sequence (transcript protein sequence)
+    :return: similarity in [0.0, 1.0]
+    """
+    if not a or not b:
+        return 0.0
+    if a == b:
+        return 1.0
+    # If query is fully contained in reference, treat as perfect local match
+    if a in b:
+        return 1.0
+    # Otherwise, compute a normalized similarity
+    return SequenceMatcher(None, a, b).ratio()
+
+
+def _choose_most_similar_transcript(
+    protein_sequence: str, mane_transcripts: list[TranscriptDescription]
+) -> TranscriptDescription | None:
+    """Choose the transcript whose protein reference is most similar to the
+    provided sequence.
+
+    Selects the highest similarity; ties keep first encountered (stable).
+    """
+    if not mane_transcripts:
+        return None
+    if len(mane_transcripts) == 1:
+        return mane_transcripts[0]
+
+    best: TranscriptDescription | None = None
+    best_score = -1.0
+    for tx in mane_transcripts:
+        ref_seq = get_sequence(tx.refseq_prot)
+        score = _percent_similarity(protein_sequence, ref_seq)
+        if score > best_score:
+            best_score = score
+            best = tx
+
+    return best
+
+
 def _choose_best_mane_transcript(
     mane_transcripts: list[ManeDescription],
 ) -> ManeDescription | None:
@@ -143,46 +188,77 @@ async def _select_protein_reference(
     :raise TxSelectError: if no matching MANE transcripts and unable to get UniProt ID/
     reference sequence
     """
-    matching_transcripts = await _get_compatible_transcripts(target_gene, align_result)
+    matching_transcripts = await _get_compatible_transcripts(align_result)
 
-    if not matching_transcripts:
-        if not target_gene.target_uniprot_ref:
-            msg = f"Unable to find matching transcripts for target gene {target_gene.target_gene_name}"
-            raise TxSelectError(msg)
-        protein_sequence = get_uniprot_sequence(target_gene.target_uniprot_ref.id)
-        np_accession = target_gene.target_uniprot_ref.id
-        ref_sequence = get_uniprot_sequence(target_gene.target_uniprot_ref.id)
-        if not ref_sequence:
-            msg = f"Unable to grab reference sequence from uniprot.org for target gene {target_gene.target_gene_name}"
-            raise TxSelectError(msg)
-        nm_accession = None
-        tx_mode = None
-    else:
-        mane_transcripts = get_mane_transcripts(matching_transcripts)
+    # Map HGNC symbols to their compatible transcripts
+    hgnc_to_transcripts: dict[str, list[str]] = {}
+    for tx, hgnc in matching_transcripts:
+        hgnc_to_transcripts.setdefault(hgnc, []).append(tx)
+
+    per_gene_best: list[ManeDescription | TranscriptDescription] = []
+    best_tx: ManeDescription | TranscriptDescription | None = None
+
+    # Choose one best transcript per gene (based on MANE priority, falling back to longest)
+    for _, transcripts in hgnc_to_transcripts.items():
+        if not transcripts:
+            continue
+
+        mane_transcripts = get_mane_transcripts(transcripts)
         best_tx = _choose_best_mane_transcript(mane_transcripts)
+
         if not best_tx:
-            best_tx = await _get_longest_compatible_transcript(
-                list(matching_transcripts)
-            )
-        if not best_tx:
-            msg = f"Unable to find matching MANE transcripts for target gene {target_gene.target_gene_name}"
+            best_tx = await _get_longest_compatible_transcript(transcripts)
+
+        if best_tx:
+            per_gene_best.append(best_tx)
+
+    # If we found any per-gene best candidates, Step 2: choose the most similar among them and
+    # select it.
+    if per_gene_best:
+        if not target_gene.target_sequence:
+            msg = f"Unable to find target sequence for target gene {target_gene.target_gene_name}"
             raise TxSelectError(msg)
+
+        protein_sequence = _get_protein_sequence(target_gene.target_sequence)
+        best_tx = _choose_most_similar_transcript(protein_sequence, per_gene_best)
+
+        # As a fallback, pick the first candidate
+        if not best_tx:
+            best_tx = per_gene_best[0]
+
         ref_sequence = get_sequence(best_tx.refseq_prot)
-        nm_accession = best_tx.refseq_nuc
-        np_accession = best_tx.refseq_prot
-        tx_mode = best_tx.transcript_priority
+        is_full_match = ref_sequence.find(protein_sequence) != -1
+        start = ref_sequence.find(protein_sequence[:10])
+
+        return TxSelectResult(
+            nm=best_tx.refseq_nuc,
+            np=best_tx.refseq_prot,
+            start=start,
+            is_full_match=is_full_match,
+            sequence=get_sequence(best_tx.refseq_prot),
+            transcript_mode=best_tx.transcript_priority,
+        )
 
-    protein_sequence = _get_protein_sequence(target_gene.target_sequence)
-    is_full_match = ref_sequence.find(protein_sequence) != -1
-    start = ref_sequence.find(protein_sequence[:10])
+    # If we didn't find any suitable transcript, attempt to use a provided UniProt reference
+    if not target_gene.target_uniprot_ref:
+        msg = f"Unable to find matching transcripts for target gene {target_gene.target_gene_name}"
+        raise TxSelectError(msg)
+
+    uniprot_sequence = get_uniprot_sequence(target_gene.target_uniprot_ref.id)
+    if not uniprot_sequence:
+        msg = f"Unable to grab reference sequence from uniprot.org for target gene {target_gene.target_gene_name}"
+        raise TxSelectError(msg)
+
+    is_full_match = uniprot_sequence.find(protein_sequence) != -1
+    start = uniprot_sequence.find(protein_sequence[:10])
 
     return TxSelectResult(
-        nm=nm_accession,
-        np=np_accession,
+        nm=None,
+        np=target_gene.target_uniprot_ref.id,
         start=start,
         is_full_match=is_full_match,
         sequence=protein_sequence,
-        transcript_mode=tx_mode,
+        transcript_mode=None,
     )
 
 
diff --git a/tests/test_transcript.py b/tests/test_transcript.py