Skip to content
47 changes: 47 additions & 0 deletions src/cool_seq_tool/mappers/exon_genomic_coords.py
Original file line number Diff line number Diff line change
Expand Up @@ -865,6 +865,22 @@ async def _genomic_to_tx_segment(
if use_alt_start_i and coordinate_type == CoordinateType.RESIDUE:
genomic_pos = genomic_pos - 1 # Convert residue coordinate to inter-residue

# Validate that the breakpoint occurs within 150 bp of the first and last exon for the selected transcript.
# A breakpoint beyond this range is likely erroneous.
coordinate_check = await self._validate_genomic_breakpoint(
pos=genomic_pos, genomic_ac=genomic_ac, tx_ac=transcript
)
if not coordinate_check:
msg = (
f"{genomic_pos} on {genomic_ac} occurs more than 150 bp outside the "
f"exon boundaries of the {transcript} transcript, indicating this may not "
f"be a chimeric transcript junction and is unlikely to represent a "
f"contiguous coding sequence. Confirm that the genomic position "
f"{genomic_pos} is being used to represent transcript junction and not "
f"DNA breakpoint."
)
_logger.warning(msg)

# Check if breakpoint occurs on an exon.
# If not, determine the adjacent exon given the selected transcript
if not self._is_exonic_breakpoint(genomic_pos, tx_exons):
Expand Down Expand Up @@ -932,6 +948,37 @@ async def _get_grch38_pos(
)
return liftover_data[1] if liftover_data else None

async def _validate_genomic_breakpoint(
self,
pos: int,
genomic_ac: str,
tx_ac: str,
) -> bool:
"""Validate that a genomic coordinate falls within the first and last exon
for a transcript on a given accession

:param pos: Genomic position on ``genomic_ac``
:param genomic_ac: RefSeq genomic accession, e.g. ``"NC_000007.14"``
:param transcript: A transcript accession
:return: ``True`` if the coordinate falls within 150bp of the first and last exon
for the transcript, ``False`` if not. Breakpoints past this threshold
are likely erroneous.
"""
query = f"""
WITH tx_boundaries AS (
SELECT
MIN(alt_start_i) AS min_start,
MAX(alt_end_i) AS max_end
FROM {self.uta_db.schema}.tx_exon_aln_v
WHERE tx_ac = '{tx_ac}'
AND alt_ac = '{genomic_ac}'
)
SELECT * FROM tx_boundaries
WHERE {pos} between (tx_boundaries.min_start - 150) and (tx_boundaries.max_end + 150)
""" # noqa: S608
results = await self.uta_db.execute_query(query)
return bool(results)

async def _get_tx_ac_gene(
self,
tx_ac: str,
Expand Down
34 changes: 33 additions & 1 deletion tests/mappers/test_exon_genomic_coords.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Module for testing that Cool Seq Tool works correctly."""

import logging
from datetime import datetime

import pytest
Expand Down Expand Up @@ -1463,7 +1464,7 @@ async def test_valid_inputs(test_egc_mapper, eln_grch38_intronic):


@pytest.mark.asyncio
async def test_invalid(test_egc_mapper):
async def test_invalid(test_egc_mapper, caplog):
"""Test that invalid queries return `None`."""
resp = await test_egc_mapper.genomic_to_tx_segment(
transcript="NM_152263 3",
Expand Down Expand Up @@ -1514,6 +1515,37 @@ async def test_invalid(test_egc_mapper):
genomic_tx_seg_service_checks(resp, is_valid=False)
assert resp.errors == ["Must provide either `gene` or `transcript`"]

# Check 150 bp warning statement
with caplog.at_level(logging.WARNING):
await test_egc_mapper.genomic_to_tx_segment(
genomic_ac="NC_000001.11",
seg_start_genomic=9999999999998,
seg_end_genomic=9999999999999,
transcript="NM_152263.3",
)
assert any(
(
"9999999999998 on NC_000001.11 occurs more than 150 bp outside the exon "
"boundaries of the NM_152263.3 transcript, indicating this may not be "
"a chimeric transcript junction and is unlikely to represent a "
"contiguous coding sequence. Confirm that the genomic position 9999999999998 is "
"being used to represent transcript junction and not DNA breakpoint."
)
in record.message
for record in caplog.records
)
assert any(
(
"9999999999999 on NC_000001.11 occurs more than 150 bp outside the exon "
"boundaries of the NM_152263.3 transcript, indicating this may not be "
"a chimeric transcript junction and is unlikely to represent a "
"contiguous coding sequence. Confirm that the genomic position 9999999999999 is "
"being used to represent transcript junction and not DNA breakpoint."
)
in record.message
for record in caplog.records
)

# Exon 22 does not exist
resp = await test_egc_mapper.tx_segment_to_genomic(
exon_start=None,
Expand Down