Skip to content

Commit fed9ed1

Browse files
committed
Refactor transcript selection
1 parent b43065c commit fed9ed1

File tree

1 file changed

+43
-55
lines changed

1 file changed

+43
-55
lines changed

src/cool_seq_tool/mappers/exon_genomic_coords.py

Lines changed: 43 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -805,36 +805,9 @@ async def _genomic_to_tx_segment(
805805
):
806806
transcript = mane_transcripts[0]["RefSeq_nuc"]
807807
else:
808-
# Attempt to find a coding transcript if a MANE transcript
809-
# cannot be found
810-
results = (
811-
await self.mane_transcript.get_longest_compatible_transcript(
812-
start_pos=genomic_pos,
813-
end_pos=genomic_pos,
814-
start_annotation_layer=AnnotationLayer.GENOMIC,
815-
gene=gene,
816-
)
808+
transcript = await self._select_optimal_transcript(
809+
genomic_pos, genomic_ac, gene
817810
)
818-
if results:
819-
transcript = results.refseq
820-
else:
821-
# Run if gene is for a noncoding transcript
822-
query = f"""
823-
SELECT DISTINCT tx_ac
824-
FROM {self.uta_db.schema}.tx_exon_aln_v
825-
WHERE hgnc = '{gene}'
826-
AND alt_ac = '{genomic_ac}'
827-
""" # noqa: S608
828-
result = await self.uta_db.execute_query(query)
829-
830-
if result:
831-
transcript = result[0]["tx_ac"]
832-
else:
833-
return GenomicTxSeg(
834-
errors=[
835-
f"Could not find a transcript for {gene} on {genomic_ac}"
836-
]
837-
)
838811
tx_exons = await self._get_all_exon_coords(
839812
tx_ac=transcript, genomic_ac=genomic_ac
840813
)
@@ -942,6 +915,45 @@ async def _genomic_to_tx_segment(
942915
genomic_ac, genomic_pos, is_seg_start, gene, tx_ac=transcript
943916
)
944917

918+
async def _select_optimal_transcript(
919+
self, genomic_pos: int, genomic_ac: str, gene: str
920+
) -> str:
921+
"""Select the optimal transcript given a genomic position, accession, and gene.
922+
923+
:param genomic_pos: Genomic position where the transcript segment starts or ends
924+
(inter-residue based)
925+
:param genomic_ac: Genomic accession
926+
:param gene: Valid, case-sensitive HGNC gene symbol
927+
:return A string representing the optimal transcript given the above data
928+
"""
929+
# Attempt to find a coding transcript if a MANE transcript cannot be found
930+
transcript = None
931+
results = await self.mane_transcript.get_longest_compatible_transcript(
932+
start_pos=genomic_pos,
933+
end_pos=genomic_pos,
934+
start_annotation_layer=AnnotationLayer.GENOMIC,
935+
gene=gene,
936+
)
937+
if results:
938+
transcript = results.refseq
939+
else:
940+
# Run if gene is for a noncoding transcript
941+
query = f"""
942+
SELECT DISTINCT tx_ac
943+
FROM {self.uta_db.schema}.tx_exon_aln_v
944+
WHERE hgnc = '{gene}'
945+
AND alt_ac = '{genomic_ac}'
946+
""" # noqa: S608
947+
result = await self.uta_db.execute_query(query)
948+
949+
if result:
950+
transcript = result[0]["tx_ac"]
951+
else:
952+
return GenomicTxSeg(
953+
errors=[f"Could not find a transcript for {gene} on {genomic_ac}"]
954+
)
955+
return transcript
956+
945957
async def _get_grch38_ac_pos(
946958
self, genomic_ac: str, genomic_pos: int, grch38_ac: str | None = None
947959
) -> tuple[str | None, int | None, str | None]:
@@ -1089,33 +1101,9 @@ async def _get_tx_seg_genomic_metadata(
10891101
err_msg += f" on gene {gene}"
10901102
_logger.warning(err_msg)
10911103
if not await self.uta_db.validate_mane_transcript_acc(mane_data):
1092-
results = await self.mane_transcript.get_longest_compatible_transcript(
1093-
start_pos=genomic_pos,
1094-
end_pos=genomic_pos,
1095-
start_annotation_layer=AnnotationLayer.GENOMIC,
1096-
gene=gene,
1104+
non_mane_transcript = await self._select_optimal_transcript(
1105+
genomic_pos, genomic_ac, gene
10971106
)
1098-
if results:
1099-
non_mane_transcript = results.refseq
1100-
else:
1101-
# Run if gene is for a noncoding transcript
1102-
query = f"""
1103-
SELECT DISTINCT tx_ac
1104-
FROM {self.uta_db.schema}.tx_exon_aln_v
1105-
WHERE hgnc = '{gene}'
1106-
AND alt_ac = '{genomic_ac}'
1107-
""" # noqa: S608
1108-
result = await self.uta_db.execute_query(query)
1109-
1110-
if result:
1111-
non_mane_transcript = result[0]["tx_ac"]
1112-
else:
1113-
return GenomicTxSeg(
1114-
errors=[
1115-
f"Could not find a transcript for {gene} on {genomic_ac}"
1116-
]
1117-
)
1118-
return GenomicTxSeg(errors=[err_msg])
11191107

11201108
if non_mane_transcript:
11211109
tx_ac = non_mane_transcript

0 commit comments

Comments
 (0)