Skip to content

Commit bc59bbc

Browse files
committed
feat: don't rely on user supplied target metadata for transcript selection
Prior to this change, we relied on the user supplying an appropriate HGNC symbol as the name of their target. This is no longer required. Instead, transcript selection uses the following algorithm: 1. Align the target sequence with BLAT. 2. Fetch the transcripts which overlap the aligned region (notably, without an HGNC symbol filter). 3. Perform transcript selection within each distinct gene. This leaves us with either (a) a single transcript, when only one gene overlaps the aligned region, or (b) one transcript per gene, when multiple genes overlap the aligned region. These are our candidate transcripts. 4. If more than one candidate transcript remains, compare each candidate with the provided target sequence and select the most similar one.
1 parent e421e82 commit bc59bbc

File tree

3 files changed

+292
-64
lines changed

3 files changed

+292
-64
lines changed

src/dcd_mapping/lookup.py

Lines changed: 10 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -264,35 +264,30 @@ async def get_protein_accession(transcript: str) -> str | None:
264264

265265

266266
async def get_transcripts(
    chromosome_ac: str, start: int, end: int
) -> list[tuple[str, str]]:
    """Get transcript accessions (excluding non-coding RNA) and their HGNC symbols.

    A transcript is returned when at least one of its rows in UTA's
    ``tx_exon_aln_v`` view, on the given chromosome accession, has an
    ``alt_start_i``/``alt_end_i`` interval containing ``start`` or ``end``.

    :param chromosome_ac: chromosome accession (e.g. ``"NC_000007.13"``)
    :param start: starting position
    :param end: ending position
    :return: distinct ``(transcript accession, HGNC symbol)`` pairs
    :raise DataLookupError: if acquiring the UTA handle or running the query fails
    """
    try:
        uta = CoolSeqToolBuilder().uta_db
        # DISTINCT: the same transcript can satisfy the predicate on more than one
        # row (e.g. one row containing ``start`` and another containing ``end``),
        # and callers only care about the unique (accession, symbol) pairs.
        # NOTE(review): the predicate does not match rows whose interval lies
        # strictly inside (start, end) -- confirm that is intended.
        query = f"""
            SELECT DISTINCT tx_ac, hgnc
            FROM {uta.schema}.tx_exon_aln_v
            WHERE ({start} BETWEEN alt_start_i AND alt_end_i OR {end} BETWEEN alt_start_i AND alt_end_i)
              AND alt_ac = '{chromosome_ac}'
              AND tx_ac NOT LIKE 'NR_%';
            """  # noqa: S608
        result = await uta.execute_query(query)
    except Exception as e:
        raise DataLookupError from e

    return [(row["tx_ac"], row["hgnc"]) for row in result]
296291

297292

298293
# ------------------------------ Gene Normalizer ------------------------------ #
@@ -596,7 +591,7 @@ def translate_hgvs_to_vrs(hgvs: str) -> Allele:
596591
# ----------------------------------- MANE ----------------------------------- #
597592

598593

599-
def get_mane_transcripts(transcripts: set[str]) -> list[ManeDescription]:
594+
def get_mane_transcripts(transcripts: list[str]) -> list[ManeDescription]:
600595
"""Get corresponding MANE data for transcripts. Results given in order of
601596
transcript preference.
602597

src/dcd_mapping/transcripts.py

Lines changed: 123 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Select best reference sequence."""
22
import logging
33
import re
4+
from difflib import SequenceMatcher
45

56
from Bio.Data.CodonTable import IUPACData
67
from Bio.Seq import Seq
@@ -10,7 +11,6 @@
1011
from dcd_mapping.exceptions import TxSelectError
1112
from dcd_mapping.lookup import (
1213
get_chromosome_identifier,
13-
get_gene_symbol,
1414
get_mane_transcripts,
1515
get_protein_accession,
1616
get_seqrepo,
@@ -36,35 +36,80 @@
3636

3737

3838
async def _get_compatible_transcripts(
    align_result: AlignmentResult,
) -> set[tuple[str, str]]:
    """Acquire transcripts and their HGNC symbols which overlap with all hit subranges
    of an alignment result.

    :param align_result: output of ``align()`` method
    :return: Set of ``(transcript accession, HGNC symbol)`` pairs that were returned
        for every hit subrange; empty if there are no hit subranges or no transcript
        is shared by all of them
    """
    # Drop a leading "chr" so the bare chromosome name is passed to the lookup
    aligned_chrom = align_result.chrom.removeprefix("chr")
    chromosome = get_chromosome_identifier(aligned_chrom)

    # ``None`` means "no subrange processed yet". Seeding on emptiness instead would
    # re-seed the set after the running intersection had already become empty, and
    # so return transcripts that do not cover the earlier subranges.
    transcript_matches: set[tuple[str, str]] | None = None
    for hit_range in align_result.hit_subranges:
        matches_list = await get_transcripts(chromosome, hit_range.start, hit_range.end)
        if transcript_matches is None:
            transcript_matches = set(matches_list)
        else:
            transcript_matches.intersection_update(matches_list)

        if not transcript_matches:
            # An intersection can only shrink; no need to query further subranges
            break

    return transcript_matches if transcript_matches is not None else set()
6664

6765

66+
def _percent_similarity(a: str, b: str) -> float:
67+
"""Compute a simple normalized similarity between two sequences.
68+
69+
Uses substring check (perfect local match) as a fast path; otherwise falls
70+
back to difflib's `SequenceMatcher` ratio which is robust for short strings
71+
and small edits.
72+
73+
:param a: query sequence (typically the provided target protein sequence)
74+
:param b: reference sequence (transcript protein sequence)
75+
:return: similarity in [0.0, 1.0]
76+
"""
77+
if not a or not b:
78+
return 0.0
79+
if a == b:
80+
return 1.0
81+
# If query is fully contained in reference, treat as perfect local match
82+
if a in b:
83+
return 1.0
84+
# Otherwise, compute a normalized similarity
85+
return SequenceMatcher(None, a, b).ratio()
86+
87+
88+
def _choose_most_similar_transcript(
89+
protein_sequence: str, mane_transcripts: list[TranscriptDescription]
90+
) -> TranscriptDescription | None:
91+
"""Choose the transcript whose protein reference is most similar to the
92+
provided sequence.
93+
94+
Selects the highest similarity; ties keep first encountered (stable).
95+
"""
96+
if not mane_transcripts:
97+
return None
98+
if len(mane_transcripts) == 1:
99+
return mane_transcripts[0]
100+
101+
best: TranscriptDescription | None = None
102+
best_score = -1.0
103+
for tx in mane_transcripts:
104+
ref_seq = get_sequence(tx.refseq_prot)
105+
score = _percent_similarity(protein_sequence, ref_seq)
106+
if score > best_score:
107+
best_score = score
108+
best = tx
109+
110+
return best
111+
112+
68113
def _choose_best_mane_transcript(
69114
mane_transcripts: list[ManeDescription],
70115
) -> ManeDescription | None:
@@ -143,46 +188,77 @@ async def _select_protein_reference(
143188
:raise TxSelectError: if no matching MANE transcripts and unable to get UniProt ID/
144189
reference sequence
145190
"""
146-
matching_transcripts = await _get_compatible_transcripts(target_gene, align_result)
191+
matching_transcripts = await _get_compatible_transcripts(align_result)
147192

148-
if not matching_transcripts:
149-
if not target_gene.target_uniprot_ref:
150-
msg = f"Unable to find matching transcripts for target gene {target_gene.target_gene_name}"
151-
raise TxSelectError(msg)
152-
protein_sequence = get_uniprot_sequence(target_gene.target_uniprot_ref.id)
153-
np_accession = target_gene.target_uniprot_ref.id
154-
ref_sequence = get_uniprot_sequence(target_gene.target_uniprot_ref.id)
155-
if not ref_sequence:
156-
msg = f"Unable to grab reference sequence from uniprot.org for target gene {target_gene.target_gene_name}"
157-
raise TxSelectError(msg)
158-
nm_accession = None
159-
tx_mode = None
160-
else:
161-
mane_transcripts = get_mane_transcripts(matching_transcripts)
193+
# Map HGNC symbols to their compatible transcripts
194+
hgnc_to_transcripts: dict[str, list[str]] = {}
195+
for tx, hgnc in matching_transcripts:
196+
hgnc_to_transcripts.setdefault(hgnc, []).append(tx)
197+
198+
per_gene_best: list[ManeDescription | TranscriptDescription] = []
199+
best_tx: ManeDescription | TranscriptDescription | None = None
200+
201+
# Choose one best transcript per gene (based on MANE priority, falling back to longest)
202+
for _, transcripts in hgnc_to_transcripts.items():
203+
if not transcripts:
204+
continue
205+
206+
mane_transcripts = get_mane_transcripts(transcripts)
162207
best_tx = _choose_best_mane_transcript(mane_transcripts)
208+
163209
if not best_tx:
164-
best_tx = await _get_longest_compatible_transcript(
165-
list(matching_transcripts)
166-
)
167-
if not best_tx:
168-
msg = f"Unable to find matching MANE transcripts for target gene {target_gene.target_gene_name}"
210+
best_tx = await _get_longest_compatible_transcript(transcripts)
211+
212+
if best_tx:
213+
per_gene_best.append(best_tx)
214+
215+
# If we found any per-gene best candidates, Step 2: choose the most similar among them and
216+
# select it.
217+
if per_gene_best:
218+
if not target_gene.target_sequence:
219+
msg = f"Unable to find target sequence for target gene {target_gene.target_gene_name}"
169220
raise TxSelectError(msg)
221+
222+
protein_sequence = _get_protein_sequence(target_gene.target_sequence)
223+
best_tx = _choose_most_similar_transcript(protein_sequence, per_gene_best)
224+
225+
# As a fallback, pick the first candidate
226+
if not best_tx:
227+
best_tx = per_gene_best[0]
228+
170229
ref_sequence = get_sequence(best_tx.refseq_prot)
171-
nm_accession = best_tx.refseq_nuc
172-
np_accession = best_tx.refseq_prot
173-
tx_mode = best_tx.transcript_priority
230+
is_full_match = ref_sequence.find(protein_sequence) != -1
231+
start = ref_sequence.find(protein_sequence[:10])
232+
233+
return TxSelectResult(
234+
nm=best_tx.refseq_nuc,
235+
np=best_tx.refseq_prot,
236+
start=start,
237+
is_full_match=is_full_match,
238+
sequence=get_sequence(best_tx.refseq_prot),
239+
transcript_mode=best_tx.transcript_priority,
240+
)
174241

175-
protein_sequence = _get_protein_sequence(target_gene.target_sequence)
176-
is_full_match = ref_sequence.find(protein_sequence) != -1
177-
start = ref_sequence.find(protein_sequence[:10])
242+
# If we didn't find any suitable transcript, attempt to use a provided UniProt reference
243+
if not target_gene.target_uniprot_ref:
244+
msg = f"Unable to find matching transcripts for target gene {target_gene.target_gene_name}"
245+
raise TxSelectError(msg)
246+
247+
uniprot_sequence = get_uniprot_sequence(target_gene.target_uniprot_ref.id)
248+
if not uniprot_sequence:
249+
msg = f"Unable to grab reference sequence from uniprot.org for target gene {target_gene.target_gene_name}"
250+
raise TxSelectError(msg)
251+
252+
is_full_match = uniprot_sequence.find(protein_sequence) != -1
253+
start = uniprot_sequence.find(protein_sequence[:10])
178254

179255
return TxSelectResult(
180-
nm=nm_accession,
181-
np=np_accession,
256+
nm=None,
257+
np=target_gene.target_uniprot_ref.id,
182258
start=start,
183259
is_full_match=is_full_match,
184260
sequence=protein_sequence,
185-
transcript_mode=tx_mode,
261+
transcript_mode=None,
186262
)
187263

188264

0 commit comments

Comments
 (0)