Skip to content
43 changes: 43 additions & 0 deletions src/cool_seq_tool/mappers/exon_genomic_coords.py
Original file line number Diff line number Diff line change
Expand Up @@ -865,6 +865,18 @@ async def _genomic_to_tx_segment(
if use_alt_start_i and coordinate_type == CoordinateType.RESIDUE:
genomic_pos = genomic_pos - 1 # Convert residue coordinate to inter-residue

# Validate that the breakpoint occurs within 150 bp of the first and last exon for the selected transcript.
# A breakpoint beyond this range is likely erroneous.
coordinate_check = await self._validate_genomic_breakpoint(
pos=genomic_pos, genomic_ac=genomic_ac, tx_ac=transcript
)
if not coordinate_check:
return GenomicTxSeg(
errors=[
f"{genomic_pos} on {genomic_ac} does not occur within the exons for {transcript}"
]
)

# Check if breakpoint occurs on an exon.
# If not, determine the adjacent exon given the selected transcript
if not self._is_exonic_breakpoint(genomic_pos, tx_exons):
Expand Down Expand Up @@ -932,6 +944,37 @@ async def _get_grch38_pos(
)
return liftover_data[1] if liftover_data else None

async def _validate_genomic_breakpoint(
self,
pos: int,
genomic_ac: str,
tx_ac: str,
) -> bool:
"""Validate that a genomic coordinate falls within the first and last exon
for a transcript on a given accession

:param pos: Genomic position on ``genomic_ac``
:param genomic_ac: RefSeq genomic accession, e.g. ``"NC_000007.14"``
:param transcript: A transcript accession
:return: ``True`` if the coordinate falls within 150bp of the first and last exon
for the transcript, ``False`` if not. Breakpoints past this threshold
are likely erroneous.
"""
query = f"""
WITH tx_boundaries AS (
SELECT
MIN(alt_start_i) AS min_start,
MAX(alt_end_i) AS max_end
FROM {self.uta_db.schema}.tx_exon_aln_v
WHERE tx_ac = '{tx_ac}'
AND alt_ac = '{genomic_ac}'
)
SELECT * FROM tx_boundaries
WHERE {pos} between (tx_boundaries.min_start - 150) and (tx_boundaries.max_end + 150)
""" # noqa: S608
results = await self.uta_db.execute_query(query)
return bool(results)

async def _get_tx_ac_gene(
self,
tx_ac: str,
Expand Down
12 changes: 12 additions & 0 deletions tests/mappers/test_exon_genomic_coords.py
Original file line number Diff line number Diff line change
Expand Up @@ -1507,6 +1507,18 @@ async def test_invalid(test_egc_mapper):
genomic_tx_seg_service_checks(resp, is_valid=False)
assert resp.errors == ["Genomic accession does not exist in UTA: NC_000035.200"]

# Invalid coordinates
resp = await test_egc_mapper.genomic_to_tx_segment(
genomic_ac="NC_000001.11",
seg_start_genomic=9999999999998,
seg_end_genomic=9999999999999,
transcript="NM_152263.3",
)
genomic_tx_seg_service_checks(resp, is_valid=False)
assert resp.errors == [
"9999999999998 on NC_000001.11 does not occur within the exons for NM_152263.3"
]

# Must supply either gene or transcript
resp = await test_egc_mapper.genomic_to_tx_segment(
seg_start_genomic=154191901, genomic_ac="NC_000001.11"
Expand Down