Skip to content

Commit 8cafe54

Browse files
authored
fix!: Edit logic in genomic breakpoint validator in exon_genomic_coords.py (#406)
1 parent cb55840 commit 8cafe54

File tree

2 files changed

+80
-1
lines changed

2 files changed

+80
-1
lines changed

src/cool_seq_tool/mappers/exon_genomic_coords.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -865,6 +865,22 @@ async def _genomic_to_tx_segment(
865865
if use_alt_start_i and coordinate_type == CoordinateType.RESIDUE:
866866
genomic_pos = genomic_pos - 1 # Convert residue coordinate to inter-residue
867867

868+
# Validate that the breakpoint occurs within 150 bp of the first and last exon for the selected transcript.
869+
# A breakpoint beyond this range is likely erroneous.
870+
coordinate_check = await self._validate_genomic_breakpoint(
871+
pos=genomic_pos, genomic_ac=genomic_ac, tx_ac=transcript
872+
)
873+
if not coordinate_check:
874+
msg = (
875+
f"{genomic_pos} on {genomic_ac} occurs more than 150 bp outside the "
876+
f"exon boundaries of the {transcript} transcript, indicating this may not "
877+
f"be a chimeric transcript junction and is unlikely to represent a "
878+
f"contiguous coding sequence. Confirm that the genomic position "
879+
f"{genomic_pos} is being used to represent transcript junction and not "
880+
f"DNA breakpoint."
881+
)
882+
_logger.warning(msg)
883+
868884
# Check if breakpoint occurs on an exon.
869885
# If not, determine the adjacent exon given the selected transcript
870886
if not self._is_exonic_breakpoint(genomic_pos, tx_exons):
@@ -932,6 +948,37 @@ async def _get_grch38_pos(
932948
)
933949
return liftover_data[1] if liftover_data else None
934950

951+
async def _validate_genomic_breakpoint(
952+
self,
953+
pos: int,
954+
genomic_ac: str,
955+
tx_ac: str,
956+
) -> bool:
957+
"""Validate that a genomic coordinate falls within the first and last exon
958+
for a transcript on a given accession
959+
960+
:param pos: Genomic position on ``genomic_ac``
961+
:param genomic_ac: RefSeq genomic accession, e.g. ``"NC_000007.14"``
962+
:param transcript: A transcript accession
963+
:return: ``True`` if the coordinate falls within 150bp of the first and last exon
964+
for the transcript, ``False`` if not. Breakpoints past this threshold
965+
are likely erroneous.
966+
"""
967+
query = f"""
968+
WITH tx_boundaries AS (
969+
SELECT
970+
MIN(alt_start_i) AS min_start,
971+
MAX(alt_end_i) AS max_end
972+
FROM {self.uta_db.schema}.tx_exon_aln_v
973+
WHERE tx_ac = '{tx_ac}'
974+
AND alt_ac = '{genomic_ac}'
975+
)
976+
SELECT * FROM tx_boundaries
977+
WHERE {pos} between (tx_boundaries.min_start - 150) and (tx_boundaries.max_end + 150)
978+
""" # noqa: S608
979+
results = await self.uta_db.execute_query(query)
980+
return bool(results)
981+
935982
async def _get_tx_ac_gene(
936983
self,
937984
tx_ac: str,

tests/mappers/test_exon_genomic_coords.py

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""Module for testing that Cool Seq Tool works correctly."""
22

3+
import logging
34
from datetime import datetime
45

56
import pytest
@@ -1463,7 +1464,7 @@ async def test_valid_inputs(test_egc_mapper, eln_grch38_intronic):
14631464

14641465

14651466
@pytest.mark.asyncio
1466-
async def test_invalid(test_egc_mapper):
1467+
async def test_invalid(test_egc_mapper, caplog):
14671468
"""Test that invalid queries return `None`."""
14681469
resp = await test_egc_mapper.genomic_to_tx_segment(
14691470
transcript="NM_152263 3",
@@ -1514,6 +1515,37 @@ async def test_invalid(test_egc_mapper):
15141515
genomic_tx_seg_service_checks(resp, is_valid=False)
15151516
assert resp.errors == ["Must provide either `gene` or `transcript`"]
15161517

1518+
# Check 150 bp warning statement
1519+
with caplog.at_level(logging.WARNING):
1520+
await test_egc_mapper.genomic_to_tx_segment(
1521+
genomic_ac="NC_000001.11",
1522+
seg_start_genomic=9999999999998,
1523+
seg_end_genomic=9999999999999,
1524+
transcript="NM_152263.3",
1525+
)
1526+
assert any(
1527+
(
1528+
"9999999999998 on NC_000001.11 occurs more than 150 bp outside the exon "
1529+
"boundaries of the NM_152263.3 transcript, indicating this may not be "
1530+
"a chimeric transcript junction and is unlikely to represent a "
1531+
"contiguous coding sequence. Confirm that the genomic position 9999999999998 is "
1532+
"being used to represent transcript junction and not DNA breakpoint."
1533+
)
1534+
in record.message
1535+
for record in caplog.records
1536+
)
1537+
assert any(
1538+
(
1539+
"9999999999999 on NC_000001.11 occurs more than 150 bp outside the exon "
1540+
"boundaries of the NM_152263.3 transcript, indicating this may not be "
1541+
"a chimeric transcript junction and is unlikely to represent a "
1542+
"contiguous coding sequence. Confirm that the genomic position 9999999999999 is "
1543+
"being used to represent transcript junction and not DNA breakpoint."
1544+
)
1545+
in record.message
1546+
for record in caplog.records
1547+
)
1548+
15171549
# Exon 22 does not exist
15181550
resp = await test_egc_mapper.tx_segment_to_genomic(
15191551
exon_start=None,

0 commit comments

Comments
 (0)